diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 4a96e244188bb6c7d68987d34696fff392e2c997..c6fe991b9761d5ef20af649f54224b03f2dd7fe8 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,28 @@
 # Change Log
 
+## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
+
+**Implemented enhancements:**
+
+- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711)
+- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
+- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
+- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
+- Turning off IVDEP for Intel 14.  [\#638](https://github.com/kokkos/kokkos/issues/638)
+- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
+- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
+- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
+- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
+
+**Fixed bugs:**
+
+- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
+- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
+- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
+- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
+- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
+
 ## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)
 
diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
index 16854c839a044e5da9084d2a1a7eeb4360ab0327..1c820660ae375006e83bd50c0d4bbd8472ed0258 100644
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@@ -98,10 +98,10 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   )
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
-  Kokkos_ENABLE_QTHREAD
-  KOKKOS_HAVE_QTHREAD
-  "Enable QTHREAD support in Kokkos."
-  "${TPL_ENABLE_QTHREAD}"
+  Kokkos_ENABLE_Qthreads
+  KOKKOS_HAVE_QTHREADS
+  "Enable Qthreads support in Kokkos."
+  "${TPL_ENABLE_QTHREADS}"
   )
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
@@ -110,7 +110,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   "Enable C++11 support in Kokkos."
   "${${PROJECT_NAME}_ENABLE_CXX11}"
   )
-  
+
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_HWLOC
   KOKKOS_HAVE_HWLOC
@@ -213,4 +213,3 @@ TRIBITS_EXCLUDE_FILES(
   )
 
 TRIBITS_PACKAGE_POSTPROCESS()
-
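This hunk renames the TriBITS build option from Kokkos_ENABLE_QTHREAD to Kokkos_ENABLE_Qthreads and the corresponding define to KOKKOS_HAVE_QTHREADS. A minimal configure sketch using only the options visible in this hunk; everything else about the command line (paths, other options) is a placeholder:

    # Illustrative only: enable the renamed Qthreads option in a TriBITS-style build.
    cmake \
      -D Kokkos_ENABLE_Qthreads=ON \
      -D TPL_ENABLE_QTHREADS=ON \
      <path-to-source>
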
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 9d00c19027a37387888d9f0265c7cdfecb45cc56..5b094dba8cb786c94c9119a5865fcc0dadf9a76f 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -1,39 +1,38 @@
-# Default settings common options
+# Default settings for common options.
 
 #LAMMPS specific settings:
 KOKKOS_PATH=../../lib/kokkos
 CXXFLAGS=$(CCFLAGS)
 
-#Options: OpenMP,Serial,Pthreads,Cuda
+# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
-#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
+# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
 KOKKOS_ARCH ?= ""
-#Options: yes,no
+# Options: yes,no
 KOKKOS_DEBUG ?= "no"
-#Options: hwloc,librt,experimental_memkind
+# Options: hwloc,librt,experimental_memkind
 KOKKOS_USE_TPLS ?= ""
-#Options: c++11,c++1z
+# Options: c++11,c++1z
 KOKKOS_CXX_STANDARD ?= "c++11"
-#Options: aggressive_vectorization,disable_profiling
+# Options: aggressive_vectorization,disable_profiling
 KOKKOS_OPTIONS ?= ""
 
-#Default settings specific options
-#Options: force_uvm,use_ldg,rdc,enable_lambda
+# Default settings for Cuda-specific options.
+# Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 
-# Check for general settings
-
+# Check for general settings.
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
 KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
 KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
 
-# Check for external libraries
+# Check for external libraries.
 KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
 KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
 KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
 
-# Check for advanced settings
+# Check for advanced settings.
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
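The variables above are the user-facing knobs of Makefile.kokkos (devices, architecture, debug mode, TPLs, C++ standard, and Cuda-specific options). A hedged example of setting them on the make command line; the target name is a placeholder and the values come from the option comments above:

    # Illustrative only: "mytarget" stands in for the application's real make target.
    make mytarget KOKKOS_DEVICES="Cuda,OpenMP" KOKKOS_ARCH="Pascal60" \
         KOKKOS_DEBUG="no" KOKKOS_CUDA_OPTIONS="enable_lambda"
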
@@ -41,21 +40,21 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
 
-# Check for Kokkos Host Execution Spaces one of which must be on
-
+# Check for Kokkos Host Execution Spaces, one of which must be on.
 KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
 KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
+KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
 KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
-KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
-	KOKKOS_INTERNAL_USE_SERIAL := 1
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
+  KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
 endif
 endif
 
-# Check for other Execution Spaces
-
+# Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@@ -64,27 +63,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
 endif
 
-# Check OS
-
+# Check OS.
 KOKKOS_OS                      := $(shell uname -s)
 KOKKOS_INTERNAL_OS_CYGWIN      := $(shell uname -s | grep CYGWIN | wc -l)
 KOKKOS_INTERNAL_OS_LINUX       := $(shell uname -s | grep Linux  | wc -l)
 KOKKOS_INTERNAL_OS_DARWIN      := $(shell uname -s | grep Darwin | wc -l)
 
-# Check compiler
-
-KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version        2>&1 | grep "Intel Corporation" | wc -l)
-KOKKOS_INTERNAL_COMPILER_PGI   := $(shell $(CXX) --version        2>&1 | grep PGI   | wc -l)
-KOKKOS_INTERNAL_COMPILER_XL    := $(shell $(CXX) -qversion        2>&1 | grep XL    | wc -l)
-KOKKOS_INTERNAL_COMPILER_CRAY  := $(shell $(CXX) -craype-verbose  2>&1 | grep "CC-" | wc -l)
-KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(CXX) --version        2>&1 | grep "nvcc" | wc -l)
+# Check compiler.
+KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version       2>&1 | grep "Intel Corporation" | wc -l)
+KOKKOS_INTERNAL_COMPILER_PGI   := $(shell $(CXX) --version       2>&1 | grep PGI                 | wc -l)
+KOKKOS_INTERNAL_COMPILER_XL    := $(shell $(CXX) -qversion       2>&1 | grep XL                  | wc -l)
+KOKKOS_INTERNAL_COMPILER_CRAY  := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-"               | wc -l)
+KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(CXX) --version       2>&1 | grep "nvcc"              | wc -l)
 ifneq ($(OMPI_CXX),)
   KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(OMPI_CXX) --version   2>&1 | grep "nvcc" | wc -l)
 endif
 ifneq ($(MPICH_CXX),)
   KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(MPICH_CXX) --version  2>&1 | grep "nvcc" | wc -l)
 endif
-KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version        2>&1 | grep "clang" | wc -l)
+KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version       2>&1 | grep "clang"             | wc -l)
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
   KOKKOS_INTERNAL_COMPILER_CLANG = 1
@@ -95,17 +92,17 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
   KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
+
   ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
     ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
-      $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)    
+      $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
     endif
     KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
   endif
 endif
 
-
 ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-  KOKKOS_INTERNAL_OPENMP_FLAG := -mp 
+  KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
@@ -114,7 +111,7 @@ else
       KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
     else
       ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-        # OpenMP is turned on by default in Cray compiler environment
+        # OpenMP is turned on by default in the Cray compiler environment.
         KOKKOS_INTERNAL_OPENMP_FLAG :=
       else
         KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
@@ -138,9 +135,9 @@ else
   endif
 endif
 
-# Check for Kokkos Architecture settings
+# Check for Kokkos Architecture settings.
 
-#Intel based
+# Intel based.
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
@@ -148,8 +145,8 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW |
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 
-#NVIDIA based
-NVCC_WRAPPER :=  $(KOKKOS_PATH)/config/nvcc_wrapper
+# NVIDIA based.
+NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
@@ -170,46 +167,46 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
-KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
-KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
-KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
-endif
-
-#ARM based
+  KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
+  KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
+  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+endif
+
+# ARM based.
 KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
 
-#IBM based
+# IBM based.
 KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
 
-#AMD based
+# AMD based.
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
 
-#Any AVX?
+# Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
 
-# Decide what ISA level we are able to support
-KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
-KOKKOS_INTERNAL_USE_ISA_KNC        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
-KOKKOS_INTERNAL_USE_ISA_POWERPCLE  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
+# Decide what ISA level we are able to support.
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
+KOKKOS_INTERNAL_USE_ISA_KNC       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
+KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
 
-#Incompatible flags?
+# Incompatible flags?
 KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
 
@@ -220,7 +217,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
   $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
 endif
 
-#Generating the list of Flags
+# Generate the list of flags.
 
 KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
 
@@ -233,98 +230,96 @@ KOKKOS_CXXFLAGS =
 
 KOKKOS_LIBS = -lkokkos -ldl
 KOKKOS_LDFLAGS = -L$(shell pwd)
-KOKKOS_SRC = 
+KOKKOS_SRC =
 KOKKOS_HEADERS =
 
-#Generating the KokkosCore_config.h file
+# Generating the KokkosCore_config.h file.
 
 tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
 tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
 tmp := $(shell date >> KokkosCore_config.tmp)
 tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
 
-
 tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) 
+  tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp )
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
-	KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib 
-	tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
-        tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
-        tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-	KOKKOS_CXXFLAGS += -lineinfo
+  KOKKOS_CXXFLAGS += -lineinfo
 endif
-	KOKKOS_CXXFLAGS += -g 
-	KOKKOS_LDFLAGS += -g -ldl
-	tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -g
+  KOKKOS_LDFLAGS += -g -ldl
+  tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
-	KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
-	KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib 
-        KOKKOS_LIBS += -lhwloc
-	tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
+  KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+  KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
+  KOKKOS_LIBS += -lhwloc
+  tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
-	tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
   tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> KokkosCore_config.tmp )
-	KOKKOS_LIBS += -lrt
+  KOKKOS_LIBS += -lrt
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
-  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib 
-        KOKKOS_LIBS += -lmemkind
+  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
+  KOKKOS_LIBS += -lmemkind
   tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif
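For reference, the echo commands in this region assemble KokkosCore_config.tmp, and a later rule copies it over KokkosCore_config.h only when the set of defines actually changed. With OpenMP as the only device and default settings, the generated header would look roughly like this (date line and later sections omitted; exact contents depend on the chosen options):

    /* ---------------------------------------------
    Makefile constructed configuration:
    ----------------------------------------------*/
    /* Execution Spaces */
    #define KOKKOS_HAVE_OPENMP 1
    /* General Settings */
    #define KOKKOS_HAVE_CXX11 1
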
 
@@ -341,262 +336,286 @@ endif
 tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += --relocatable-device-code=true
-	KOKKOS_LDFLAGS += --relocatable-device-code=true
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += --relocatable-device-code=true
+  KOKKOS_LDFLAGS += --relocatable-device-code=true
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
     ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += -expt-extended-lambda
+      tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
+      KOKKOS_CXXFLAGS += -expt-extended-lambda
     else
       $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
     endif
   endif
+
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
   endif
 endif
+
 endif
 
-#Add Architecture flags
+# Add Architecture flags.
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8-a
-		KOKKOS_LDFLAGS += -march=armv8-a
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8-a
+      KOKKOS_LDFLAGS += -march=armv8-a
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8.1-a
-		KOKKOS_LDFLAGS += -march=armv8.1-a
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8.1-a
+      KOKKOS_LDFLAGS += -march=armv8.1-a
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
-		KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
+      KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -mavx
-		KOKKOS_LDFLAGS  += -mavx
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
-				KOKKOS_CXXFLAGS += -tp=sandybridge
-				KOKKOS_LDFLAGS  += -tp=sandybridge
-			else
-				# Assume that this is a really a GNU compiler
-				KOKKOS_CXXFLAGS += -mavx
-				KOKKOS_LDFLAGS  += -mavx
-			endif
-		endif
-	endif
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -mavx
+    KOKKOS_LDFLAGS  += -mavx
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=sandybridge
+        KOKKOS_LDFLAGS  += -tp=sandybridge
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -mavx
+        KOKKOS_LDFLAGS  += -mavx
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
+  tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-	else
-		# Assume that this is a really a GNU compiler or it could be XL on P8
-		KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
-		KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
-	endif
+  else
+    # Assume that this is really a GNU compiler, or it could be XL on P8.
+    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+    KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
+  tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-	else
-		# Assume that this is a really a GNU compiler or it could be XL on P9
-		KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
-		KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
-	endif
+  else
+    # Assume that this is really a GNU compiler, or it could be XL on P9.
+    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+    KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xCORE-AVX2
-		KOKKOS_LDFLAGS  += -xCORE-AVX2
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
-				KOKKOS_CXXFLAGS += -tp=haswell
-				KOKKOS_LDFLAGS  += -tp=haswell
-			else
-				# Assume that this is a really a GNU compiler
-				KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
-				KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
-			endif
-		endif
-	endif
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX2
+    KOKKOS_LDFLAGS  += -xCORE-AVX2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=haswell
+        KOKKOS_LDFLAGS  += -tp=haswell
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+        KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xMIC-AVX512
-		KOKKOS_LDFLAGS  += -xMIC-AVX512
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xMIC-AVX512
+    KOKKOS_LDFLAGS  += -xMIC-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-			else
-				# Asssume that this is really a GNU compiler
-				KOKKOS_CXXFLAGS += -march=knl
-				KOKKOS_LDFLAGS  += -march=knl
-			endif
-		endif
-	endif
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=knl
+        KOKKOS_LDFLAGS  += -march=knl
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xCORE-AVX512
-		KOKKOS_LDFLAGS  += -xCORE-AVX512
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX512
+    KOKKOS_LDFLAGS  += -xCORE-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-			else
-				# Nothing here yet
-				KOKKOS_CXXFLAGS += -march=skylake-avx512
-				KOKKOS_LDFLAGS  += -march=skylake-avx512
-			endif
-		endif
-	endif
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=skylake-avx512
+        KOKKOS_LDFLAGS  += -march=skylake-avx512
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += -mmic
-	KOKKOS_LDFLAGS += -mmic
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -mmic
+  KOKKOS_LDFLAGS += -mmic
 endif
 
-#Figure out the architecture flag for Cuda
+# Figure out the architecture flag for Cuda.
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
   KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
 endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-  KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
+  KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=--cuda-gpu-arch
+  KOKKOS_CXXFLAGS += -x cuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
 endif
+
 endif
- 
+
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
 ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
-KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
 else
-KOKKOS_INTERNAL_NEW_CONFIG := 1
+  KOKKOS_INTERNAL_NEW_CONFIG := 1
 endif
 
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
-	tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
+  tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
 endif
 
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
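Note that the architecture hunks above now append the GPU architecture flag to KOKKOS_LDFLAGS as well as KOKKOS_CXXFLAGS, addressing the "architecture flag not added to link line" bug (#688) listed in the changelog; for Clang, the -x cuda language flag is also separated from the arch flag so that only the arch flag is repeated at link time. Roughly, for a Pascal60 build with nvcc:

    # Sketch of the resulting flag sets (not literal Makefile output):
    #   KOKKOS_CXXFLAGS: ... -arch=sm_60   (as before)
    #   KOKKOS_LDFLAGS : ... -arch=sm_60   (new: previously missing from the link line)
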
@@ -609,53 +628,57 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
-	KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
-	KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 
-	KOKKOS_LIBS += -lcudart -lcuda
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+  KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
+  KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+  KOKKOS_LIBS += -lcudart -lcuda
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	KOKKOS_LIBS += -lpthread
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  else
+    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  endif
+
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_LIBS += -lqthread
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+  KOKKOS_LIBS += -lpthread
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-		KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
-	else
-		KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
-	endif
-	KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+  KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
+  KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
+  KOKKOS_LIBS += -lqthread
 endif
 
-#Explicitly set the GCC Toolchain for Clang
+# Explicitly set the GCC Toolchain for Clang.
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
-    KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
-    KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
-    KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
+  KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
+  KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
+  KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
+  KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
 endif
 
-#With Cygwin functions such as fdopen and fileno are not defined 
-#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
-#though. So we hard undefine it here. Not sure if that has any bad side effects
-#This is needed for gtest actually, not for Kokkos itself!
+# With Cygwin, functions such as fdopen and fileno are not defined when strict
+# ANSI mode is enabled, and strict ANSI gets enabled by --std=c++11. So we hard
+# undefine it here; not sure if that has any bad side effects.
+# This is needed for gtest, actually, not for Kokkos itself!
 ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
   KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
 endif
 
-# Setting up dependencies
+# Setting up dependencies.
 
 KokkosCore_config.h:
 
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
index a48a5f6eb7ea78712b3f6caf695745b4ef18c043..54cacb741b4f35a0033d8de0e57ded9d4dab0a00 100644
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@@ -18,6 +18,8 @@ Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
 Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
 Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -43,11 +45,11 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
-Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@@ -59,4 +61,3 @@ endif
 
 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
-
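The new Kokkos_HostThreadTeam.o rule builds the host-side team implementation introduced in this release, which appears to be what backs "getting large chunks of memory for a thread team in a universal way" (#664). A hedged, user-level sketch of requesting per-team scratch memory through the TeamPolicy scratch API; the league size, element count, and functor body are placeholders:

    // Illustrative only: each team gets one shared scratch allocation.
    typedef Kokkos::TeamPolicy<>     policy_type;
    typedef policy_type::member_type member_type;
    typedef Kokkos::View<double*,
                         Kokkos::DefaultExecutionSpace::scratch_memory_space,
                         Kokkos::MemoryUnmanaged> scratch_view;

    const int league_size = 64, scratch_count = 512;
    const size_t bytes = scratch_view::shmem_size(scratch_count);

    Kokkos::parallel_for(
      policy_type(league_size, Kokkos::AUTO).set_scratch_size(0, Kokkos::PerTeam(bytes)),
      KOKKOS_LAMBDA(const member_type& team) {
        scratch_view tmp(team.team_scratch(0), scratch_count);  // team-shared chunk
        // ... use tmp cooperatively within the team ...
      });
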
diff --git a/lib/kokkos/README b/lib/kokkos/README
index 7ebde23a1fdbc0bff5f62c025e890b204edec591..257a2e5db475dea8c89f1468c42432614c909762 100644
--- a/lib/kokkos/README
+++ b/lib/kokkos/README
@@ -45,31 +45,39 @@ Primary tested compilers on X86 are:
   GCC 4.8.4
   GCC 4.9.2
   GCC 5.1.0
+  GCC 5.2.0
   Intel 14.0.4
   Intel 15.0.2
   Intel 16.0.1
   Intel 17.0.098
+  Intel 17.1.132
   Clang 3.5.2
   Clang 3.6.1
+  Clang 3.7.1
+  Clang 3.8.1
   Clang 3.9.0
+  PGI 17.1
 
 Primary tested compilers on Power 8 are:
   GCC 5.4.0 (OpenMP,Serial)
   IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
 
 Primary tested compilers on Intel KNL are:
+   GCC 6.2.0
    Intel 16.2.181 (with gcc 4.7.2)
    Intel 17.0.098 (with gcc 4.7.2)
+   Intel 17.1.132 (with gcc 4.9.3)
+   Intel 17.2.174 (with gcc 4.9.3)
+   Intel 18.0.061 (beta) (with gcc 4.9.3)
 
 Secondary tested compilers are:
-  CUDA 7.0 (with gcc 4.7.2)
-  CUDA 7.5 (with gcc 4.7.2)
+  CUDA 7.0 (with gcc 4.8.4)
+  CUDA 7.5 (with gcc 4.8.4)
   CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
   CUDA/Clang 8.0 using Clang/Trunk compiler
 
 Other compilers working:
   X86:
-   PGI 15.4
    Cygwin 2.1.0 64bit with gcc 4.9.3
 
 Known non-working combinations:
diff --git a/lib/kokkos/algorithms/cmake/Dependencies.cmake b/lib/kokkos/algorithms/cmake/Dependencies.cmake
index 1d71d8af341181f689a6a8bf63036b67584cb138..c36b62523fadb628e970b6eccf57a9caaa317f1e 100644
--- a/lib/kokkos/algorithms/cmake/Dependencies.cmake
+++ b/lib/kokkos/algorithms/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_REQUIRED_PACKAGES KokkosCore
+  LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
   LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
   )
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index d376173bf183615e29f66bbecf6bd42cd1134a9e..bd73582362eed46161ee0ac0cf36fec4d5178129 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -547,7 +547,7 @@ namespace Kokkos {
 
     KOKKOS_INLINE_FUNCTION
     Random_XorShift64 (uint64_t state, int state_idx = 0)
-     : state_(state),state_idx_(state_idx){}
+     : state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
 
     KOKKOS_INLINE_FUNCTION
     uint32_t urand() {
@@ -719,6 +719,9 @@ namespace Kokkos {
     }
 
     void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
+
       num_states_ = num_states;
 
       locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
@@ -968,8 +971,9 @@ namespace Kokkos {
 
     inline
     void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
       num_states_ = num_states;
-
       locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
       state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
       p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
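The guards above map a zero seed (and a zero constructor state) to the non-zero constant 1318319, fixing the "RNG normal() hangs for small seed value" bug (#655). The underlying reason, sketched here under the assumption that Random_XorShift64 uses a standard xorshift-style update: zero is a fixed point of the xorshift recurrence, so a zero state keeps producing zero and any rejection loop driven by the generator never terminates.

    // Minimal sketch (not the Kokkos implementation): a plain xorshift64 step.
    #include <cstdint>
    static uint64_t xorshift64(uint64_t s) {
      s ^= s << 13;
      s ^= s >> 7;
      s ^= s << 17;
      return s;   // xorshift64(0) == 0: the state never escapes zero
    }
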
diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
index 5b8c65fee1869c25681567036314d25beab9a5f2..237de751fe4b30afa1abcf475ca8af8c52cea7ab 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -53,69 +53,122 @@ namespace Kokkos {
 
   namespace Impl {
 
-  template<class ValuesViewType, int Rank=ValuesViewType::Rank>
+  template< class DstViewType , class SrcViewType
+          , int Rank = DstViewType::Rank >
   struct CopyOp;
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,1> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,1> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
       dst(i_dst) = src(i_src);
     }
   };
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,2> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,2> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
-      for(int j = 0;j< (int) dst.dimension_1(); j++)
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0;j< (int) dst.extent(1); j++)
         dst(i_dst,j) = src(i_src,j);
     }
   };
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,3> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,3> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
-      for(int j = 0; j<dst.dimension_1(); j++)
-        for(int k = 0; k<dst.dimension_2(); k++)
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0; j<dst.extent(1); j++)
+        for(int k = 0; k<dst.extent(2); k++)
           dst(i_dst,j,k) = src(i_src,j,k);
     }
   };
   }
 
-template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
-         class SizeType = typename KeyViewType::memory_space::size_type>
+//----------------------------------------------------------------------------
+
+template< class KeyViewType
+        , class BinSortOp
+        , class Space = typename KeyViewType::device_type
+        , class SizeType = typename KeyViewType::memory_space::size_type
+        >
 class BinSort {
+public:
 
+  template< class DstViewType , class SrcViewType >
+  struct copy_functor {
 
-public:
-  template<class ValuesViewType, class PermuteViewType, class CopyOp>
-  struct bin_sort_sort_functor {
-    typedef ExecutionSpace execution_space;
-    typedef typename ValuesViewType::non_const_type values_view_type;
-    typedef typename ValuesViewType::const_type const_values_view_type;
-    Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
-                 typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
-    values_view_type sorted_values;
-    typename PermuteViewType::const_type sort_order;
-    bin_sort_sort_functor(const_values_view_type values_, values_view_type  sorted_values_, PermuteViewType sort_order_):
-       values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
+    typedef typename SrcViewType::const_type  src_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    src_view_type   src_values ;
+    int             dst_offset ;
+
+    copy_functor( DstViewType  const & dst_values_
+                , int          const & dst_offset_
+                , SrcViewType  const & src_values_
+                )
+      : dst_values( dst_values_ )
+      , src_values( src_values_ )
+      , dst_offset( dst_offset_ )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i) const {
+      // printf("copy: dst(%i) src(%i)\n",i+dst_offset,i);
+      copy_op::copy(dst_values,i+dst_offset,src_values,i);
+    }
+  };
+
+  template< class DstViewType
+          , class PermuteViewType
+          , class SrcViewType
+          >
+  struct copy_permute_functor {
+
+    // If the source is a Kokkos::View, we can generate a constant random-access
+    // view of it; otherwise we can only use its constant type.
+
+    typedef typename std::conditional
+      < Kokkos::is_view< SrcViewType >::value
+      , Kokkos::View< typename SrcViewType::const_data_type
+                    , typename SrcViewType::array_layout
+                    , typename SrcViewType::device_type
+                    , Kokkos::MemoryTraits<Kokkos::RandomAccess>
+                    >
+      , typename SrcViewType::const_type
+      >::type src_view_type ;
+
+    typedef typename PermuteViewType::const_type  perm_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    perm_view_type  sort_order ;
+    src_view_type   src_values ;
+
+    copy_permute_functor( DstViewType     const & dst_values_
+                        , PermuteViewType const & sort_order_
+                        , SrcViewType     const & src_values_
+                        )
+      : dst_values( dst_values_ )
+      , sort_order( sort_order_ )
+      , src_values( src_values_ )
+      {}
 
     KOKKOS_INLINE_FUNCTION
     void operator() (const int& i)  const {
-      //printf("Sort: %i %i\n",i,sort_order(i));
-      CopyOp::copy(sorted_values,i,values,sort_order(i));
+      // printf("copy_permute: dst(%i) src(%i)\n",i,sort_order(i));
+      copy_op::copy(dst_values,i,src_values,sort_order(i));
     }
   };
 
-  typedef ExecutionSpace execution_space;
+  typedef typename Space::execution_space  execution_space;
   typedef BinSortOp bin_op_type;
 
   struct bin_count_tag {};
@@ -124,84 +177,137 @@ public:
   struct bin_sort_bins_tag {};
 
 public:
+
   typedef SizeType size_type;
   typedef size_type value_type;
 
-  typedef Kokkos::View<size_type*, execution_space> offset_type;
-  typedef Kokkos::View<const int*, execution_space> bin_count_type;
+  typedef Kokkos::View<size_type*, Space> offset_type;
+  typedef Kokkos::View<const int*, Space> bin_count_type;
 
+  typedef typename KeyViewType::const_type  const_key_view_type ;
 
-  typedef Kokkos::View<typename KeyViewType::const_data_type,
-                       typename KeyViewType::array_layout,
-                       typename KeyViewType::memory_space> const_key_view_type;
-  typedef Kokkos::View<typename KeyViewType::const_data_type,
-                       typename KeyViewType::array_layout,
-                       typename KeyViewType::memory_space,
-                       Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
+  // If the key container is a Kokkos::View we can add the constant, random-access
+  // memory trait; otherwise only the constant type is available.
+
+  typedef typename std::conditional
+    < Kokkos::is_view< KeyViewType >::value
+    , Kokkos::View< typename KeyViewType::const_data_type,
+                    typename KeyViewType::array_layout,
+                    typename KeyViewType::device_type,
+                    Kokkos::MemoryTraits<Kokkos::RandomAccess> >
+    , const_key_view_type
+    >::type const_rnd_key_view_type;
 
   typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
   typedef typename KeyViewType::const_value_type     const_key_scalar;
 
+  typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
+
 private:
+
   const_key_view_type keys;
   const_rnd_key_view_type keys_rnd;
 
 public:
-  BinSortOp bin_op;
 
-  offset_type bin_offsets;
+  BinSortOp             bin_op ;
+  offset_type           bin_offsets ;
+  bin_count_atomic_type bin_count_atomic ;
+  bin_count_type        bin_count_const ;
+  offset_type           sort_order ;
 
-  Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
-  bin_count_type bin_count_const;
-
-  offset_type sort_order;
-
-  bool sort_within_bins;
+  int                   range_begin ;
+  int                   range_end ;
+  bool                  sort_within_bins ;
 
 public:
 
-  // Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
-  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
-          bool sort_within_bins_ = false)
-     :keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
+  BinSort() {}
 
-    bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
+  //----------------------------------------
+  // Constructor: takes the keys, a [begin,end) index range to sort, the binning operator, and optionally whether to sort within bins (default false)
+  BinSort( const_key_view_type  keys_
+         , int                  range_begin_
+         , int                  range_end_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : keys(keys_)
+     , keys_rnd(keys_)
+     , bin_op(bin_op_)
+     , bin_offsets()
+     , bin_count_atomic()
+     , bin_count_const()
+     , sort_order()
+     , range_begin( range_begin_ )
+     , range_end( range_end_ )
+     , sort_within_bins( sort_within_bins_ )
+  {
+    bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
     bin_count_const =  bin_count_atomic;
     bin_offsets =      offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
-    sort_order =       offset_type("PermutationVector",keys.dimension_0());
-    sort_within_bins = sort_within_bins_;
+    sort_order =       offset_type("PermutationVector",range_end-range_begin);
   }
 
+  BinSort( const_key_view_type  keys_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
+
+  //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if the keys change
   void create_permute_vector() {
-    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag>    (0,keys.dimension_0()),*this);
-    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
+    const size_t len = range_end - range_begin ;
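+
+    // The permutation is built in phases: count the keys falling into each
+    // bin, scan the counts into bin offsets, scatter every index into its
+    // bin's next free slot, and optionally sort the contents of each bin.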
+    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_count_tag>    (0,len),*this);
+    Kokkos::parallel_scan(Kokkos::RangePolicy<execution_space,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
 
     Kokkos::deep_copy(bin_count_atomic,0);
-    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag>  (0,keys.dimension_0()),*this);
+    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_binning_tag>  (0,len),*this);
 
     if(sort_within_bins)
-      Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
+      Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
   }
 
   // Sort a view with respect to the first dimension using the permutation array
   template<class ValuesViewType>
-  void sort(ValuesViewType values) {
-    ValuesViewType sorted_values = ValuesViewType("Copy",
-           values.dimension_0(),
-           values.dimension_1(),
-           values.dimension_2(),
-           values.dimension_3(),
-           values.dimension_4(),
-           values.dimension_5(),
-           values.dimension_6(),
-           values.dimension_7());
-
-    parallel_for(values.dimension_0(),
-        bin_sort_sort_functor<ValuesViewType, offset_type,
-                              Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
-
-    deep_copy(values,sorted_values);
+  void sort( ValuesViewType const & values)
+  {
+    typedef
+      Kokkos::View< typename ValuesViewType::data_type,
+                    typename ValuesViewType::array_layout,
+                    typename ValuesViewType::device_type >
+        scratch_view_type ;
+
+    const size_t len = range_end - range_begin ;
+
+    scratch_view_type
+      sorted_values("Scratch",
+                    len,
+                    values.extent(1),
+                    values.extent(2),
+                    values.extent(3),
+                    values.extent(4),
+                    values.extent(5),
+                    values.extent(6),
+                    values.extent(7));
+
+    {
+      copy_permute_functor< scratch_view_type /* DstViewType */
+                          , offset_type       /* PermuteViewType */
+                          , ValuesViewType    /* SrcViewType */
+                          >
+        functor( sorted_values , sort_order , values );
+
+      parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
+
+    {
+      copy_functor< ValuesViewType , scratch_view_type >
+        functor( values , range_begin , sorted_values );
+
+      parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
   }
 
   // Get the permutation vector
@@ -217,9 +323,11 @@ public:
   bin_count_type get_bin_count() const {return bin_count_const;}
 
 public:
+
   KOKKOS_INLINE_FUNCTION
   void operator() (const bin_count_tag& tag, const int& i) const {
-    bin_count_atomic(bin_op.bin(keys,i))++;
+    const int j = range_begin + i ;
+    bin_count_atomic(bin_op.bin(keys,j))++;
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -232,10 +340,11 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   void operator() (const bin_binning_tag& tag, const int& i)  const {
-    const int bin = bin_op.bin(keys,i);
+    const int j     = range_begin + i ;
+    const int bin   = bin_op.bin(keys,j);
     const int count = bin_count_atomic(bin)++;
 
-    sort_order(bin_offsets(bin) + count) = i;
+    sort_order(bin_offsets(bin) + count) = j ;
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -262,13 +371,19 @@ public:
   }
 };
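+
+// A minimal usage sketch (illustrative only; the key view type, bin count and
+// min/max values below are assumptions):
+//
+//   typedef BinOp1D< KeyViewType > CompType ;
+//   BinSort< KeyViewType , CompType >
+//     bin_sort( keys , CompType( keys.extent(0)/2 , min_val , max_val ) , true );
+//   bin_sort.create_permute_vector();  // build the permutation from the keys
+//   bin_sort.sort( values );           // reorder any view of matching length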
 
+//----------------------------------------------------------------------------
+
 template<class KeyViewType>
 struct BinOp1D {
-  const int max_bins_;
-  const double mul_;
+  int max_bins_;
+  double mul_;
   typename KeyViewType::const_value_type range_;
   typename KeyViewType::const_value_type min_;
 
+  BinOp1D():max_bins_(0),mul_(0.0),
+            range_(typename KeyViewType::const_value_type()),
+            min_(typename KeyViewType::const_value_type()) {}
+
+    // Construct BinOp with number of bins, minimum value and maximum value
   BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
                                typename KeyViewType::const_value_type max )
@@ -302,12 +417,14 @@ struct BinOp3D {
   typename KeyViewType::non_const_value_type range_[3];
   typename KeyViewType::non_const_value_type min_[3];
 
+  BinOp3D() {}
+
   BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
                                typename KeyViewType::const_value_type max[] )
   {
-    max_bins_[0] = max_bins__[0]+1;
-    max_bins_[1] = max_bins__[1]+1;
-    max_bins_[2] = max_bins__[2]+1;
+    max_bins_[0] = max_bins__[0];
+    max_bins_[1] = max_bins__[1];
+    max_bins_[2] = max_bins__[2];
     mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
     mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
     mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
@@ -364,7 +481,7 @@ bool try_std_sort(ViewType view) {
   possible  = possible && (ViewType::Rank == 1);
   possible  = possible && (stride[0] == 1);
   if(possible)  {
-   std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
+   std::sort(view.data(),view.data()+view.extent(0));
   }
   return possible;
 }
@@ -386,7 +503,8 @@ struct min_max_functor {
 }
 
 template<class ViewType>
-void sort(ViewType view, bool always_use_kokkos_sort = false) {
+void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
+{
   if(!always_use_kokkos_sort) {
     if(Impl::try_std_sort(view)) return;
   }
@@ -394,14 +512,37 @@ void sort(ViewType view, bool always_use_kokkos_sort = false) {
 
   Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
   Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
-  parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
+  parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
                   Impl::min_max_functor<ViewType>(view),reducer);
   if(result.min_val == result.max_val) return;
-  BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
+  BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
   bin_sort.create_permute_vector();
   bin_sort.sort(view);
 }
 
+template<class ViewType>
+void sort( ViewType view
+         , size_t const begin
+         , size_t const end
+         )
+{
+  typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
+  typedef BinOp1D<ViewType> CompType;
+
+  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+
+  parallel_reduce( range_policy( begin , end )
+                 , Impl::min_max_functor<ViewType>(view),reducer );
+
+  if(result.min_val == result.max_val) return;
+
+  BinSort<ViewType, CompType>
+    bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
+
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view);
+}
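+
+// Usage sketch for the [begin,end) overload (illustrative; the view, pool and
+// sizes are assumptions). It is exercised with a DynamicView in
+// algorithms/unit_tests/TestSort.hpp:
+//
+//   Kokkos::Experimental::DynamicView<int*,Space> keys("Keys",pool,capacity);
+//   keys.resize_serial(n);
+//   // ... fill the first n entries ...
+//   Kokkos::sort( keys , 0 /* begin */ , n /* end */ );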
 }
 
 #endif
diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
index 03e4fb691ef1a4ae6a7bed6471ccba4e3fd53762..61ffa6f43a39ecbb1640a71de5afb9be33cd10dd 100644
--- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -44,6 +44,7 @@
 
 #include <gtest/gtest.h>
 #include<Kokkos_Core.hpp>
+#include<Kokkos_DynamicView.hpp>
 #include<Kokkos_Random.hpp>
 #include<Kokkos_Sort.hpp>
 
@@ -192,17 +193,81 @@ void test_3D_sort(unsigned int n) {
   double epsilon = 1e-10;
   unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
 
-  printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+  if ( sort_fails )
+    printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+
   ASSERT_EQ(sort_fails,0);
   ASSERT_EQ(equal_sum,1);
 }
 
+//----------------------------------------------------------------------------
+
+template<class ExecutionSpace, typename KeyType>
+void test_dynamic_view_sort(unsigned int n )
+{
+  typedef typename ExecutionSpace::memory_space memory_space ;
+  typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+
+  const size_t upper_bound = 2 * n ;
+
+  typename KeyDynamicViewType::memory_pool
+    pool( memory_space() , 2 * n * sizeof(KeyType) );
+
+  KeyDynamicViewType keys("Keys",pool,upper_bound);
+
+  keys.resize_serial(n);
+
+  KeyViewType keys_view("KeysTmp", n );
+
+  // Test sorting an array in which all numbers are equal
+  Kokkos::deep_copy(keys_view,KeyType(1));
+  Kokkos::Experimental::deep_copy(keys,keys_view);
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
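+  // With every key equal the min/max reduction inside sort() finds
+  // min == max, so this call exercises the early-return path.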
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
+
+  Kokkos::Experimental::deep_copy(keys,keys_view);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
+
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
+
+  Kokkos::Experimental::deep_copy( keys_view , keys );
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  if ( sort_fails != 0 || equal_sum != 1 ) {
+    std::cout << " N = " << n
+              << " ; sum_before = " << sum_before
+              << " ; sum_after = " << sum_after
+              << " ; ratio = " << ratio
+              << std::endl ;
+  }
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+//----------------------------------------------------------------------------
+
 template<class ExecutionSpace, typename KeyType>
 void test_sort(unsigned int N)
 {
   test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
   test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
   test_3D_sort<ExecutionSpace,KeyType>(N);
+  test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
 }
 
 }
diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper
index cb206cf88b2c4e3a4f289bc919cc272e22749f36..09fa5d500abcdfe718a6d3bb12db5c91fc5ec174 100755
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@@ -140,6 +140,9 @@ do
 #strip off pedantic because it produces endless warnings about #LINE added by the preprocessor
   -pedantic|-Wpedantic|-ansi)
     ;;
+  #strip off -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C"
+  -Woverloaded-virtual)
+    ;;
   #strip -Xcompiler because we add it
   -Xcompiler)
     if [ $first_xcompiler_arg -eq 1 ]; then
@@ -190,7 +193,7 @@ do
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
   #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
-  *.dylib)
+  @*|*.dylib)
     object_files="$object_files -Xlinker $1"
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
diff --git a/lib/kokkos/cmake/deps/QTHREAD.cmake b/lib/kokkos/cmake/deps/QTHREADS.cmake
similarity index 98%
rename from lib/kokkos/cmake/deps/QTHREAD.cmake
rename to lib/kokkos/cmake/deps/QTHREADS.cmake
index 994b72b20096f4462beab51d19e4410cd73bf05b..c312f2590bcd29197a0cf3fbd5e0b484579a09c2 100644
--- a/lib/kokkos/cmake/deps/QTHREAD.cmake
+++ b/lib/kokkos/cmake/deps/QTHREADS.cmake
@@ -63,8 +63,7 @@
 #    Source:        https://code.google.com/p/qthreads
 #
 
-TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
   REQUIRED_HEADERS qthread.h
   REQUIRED_LIBS_NAMES "qthread"
   )
-
diff --git a/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake b/lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
similarity index 98%
rename from lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
rename to lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
index 994b72b20096f4462beab51d19e4410cd73bf05b..c312f2590bcd29197a0cf3fbd5e0b484579a09c2 100644
--- a/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
+++ b/lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
@@ -63,8 +63,7 @@
 #    Source:        https://code.google.com/p/qthreads
 #
 
-TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
   REQUIRED_HEADERS qthread.h
   REQUIRED_LIBS_NAMES "qthread"
   )
-
diff --git a/lib/kokkos/config/kokkos_dev/config-core-all.sh b/lib/kokkos/config/kokkos_dev/config-core-all.sh
index fa588c778f68330ff130364e9425d5a6aefa357c..d4fb25a8e139c315a862306173a0b1d2a07e7cbd 100755
--- a/lib/kokkos/config/kokkos_dev/config-core-all.sh
+++ b/lib/kokkos/config/kokkos_dev/config-core-all.sh
@@ -6,7 +6,7 @@
 #-----------------------------------------------------------------------------
 # Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
 #
-#   Cuda, OpenMP, Threads, Qthread, hwloc
+#   Cuda, OpenMP, Threads, Qthreads, hwloc
 #
 # module loaded on 'kokkos-dev.sandia.gov' for this build
 #
@@ -82,13 +82,13 @@ CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
 CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
 
 #-----------------------------------------------------------------------------
-# Qthread
+# Qthreads
 
-QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
+QTHREADS_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
 
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREADS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_INCLUDE_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_LIBRARY_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/lib"
 
 #-----------------------------------------------------------------------------
 # C++11
@@ -108,6 +108,3 @@ rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
 echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
 
 cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
-
-#-----------------------------------------------------------------------------
-
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index 446cbb021610164980cc6dd0fdced42b162422d7..9eaecb5031b1328989e114b50a86ac07c78b8e29 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -4,4 +4,5 @@ tag:  2.01.10    date: 09:27:2016    master: e4119325    develop: e6cda11e
 tag:  2.02.00    date: 10:30:2016    master: 6c90a581    develop: ca3dd56e
 tag:  2.02.01    date: 11:01:2016    master: 9c698c86    develop: b0072304
 tag:  2.02.07    date: 12:16:2016    master: 4b4cc4ba    develop: 382c0966
-tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6 
+tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6
+tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641 
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index 2c15e951ba25f4831c888fa731b9e25954ee0ead..6909606643df6b83c2dc77c2469768e02a13844d 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -6,29 +6,29 @@
 
 set -o pipefail
 
-# Determine current machine
+# Determine current machine.
 
 MACHINE=""
 HOSTNAME=$(hostname)
 PROCESSOR=`uname -p`
 
 if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
-    MACHINE=white
+  MACHINE=white
 elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
-    MACHINE=bowman
+  MACHINE=bowman
 elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
-    if [[ "$PROCESSOR" = "aarch64" ]]; then
-        MACHINE=sullivan
-    else
-        MACHINE=shepard
-    fi
+  if [[ "$PROCESSOR" = "aarch64" ]]; then
+    MACHINE=sullivan
+  else
+    MACHINE=shepard
+  fi
 elif [[ "$HOSTNAME" =~ apollo ]]; then
-    MACHINE=apollo
+  MACHINE=apollo
 elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
-    MACHINE=sems
+  MACHINE=sems
 else
-    echo "Unrecognized machine" >&2
-    exit 1
+  echo "Unrecognized machine" >&2
+  exit 1
 fi
 
 GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
@@ -45,10 +45,11 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
 INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
 CUDA_WARNING_FLAGS=""
 
-# Default. Machine specific can override
+# Default. Machine specific can override.
 DEBUG=False
 ARGS=""
 CUSTOM_BUILD_LIST=""
+QTHREADS_PATH=""
 DRYRUN=False
 BUILD_ONLY=False
 declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
@@ -60,86 +61,90 @@ PRINT_HELP=False
 OPT_FLAG=""
 KOKKOS_OPTIONS=""
 
-
 #
-# Handle arguments
+# Handle arguments.
 #
 
 while [[ $# > 0 ]]
 do
-key="$1"
-case $key in
---kokkos-path*)
-KOKKOS_PATH="${key#*=}"
-;;
---build-list*)
-CUSTOM_BUILD_LIST="${key#*=}"
-;;
---debug*)
-DEBUG=True
-;;
---build-only*)
-BUILD_ONLY=True
-;;
---test-script*)
-TEST_SCRIPT=True
-;;
---skip-hwloc*)
-SKIP_HWLOC=True
-;;
---num*)
-NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
-;;
---dry-run*)
-DRYRUN=True
-;;
---spot-check*)
-SPOT_CHECK=True
-;;
---arch*)
-ARCH_FLAG="--arch=${key#*=}"
-;;
---opt-flag*)
-OPT_FLAG="${key#*=}"
-;;
---with-cuda-options*)
-KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
-;;
---help*)
-PRINT_HELP=True
-;;
-*)
-# args, just append
-ARGS="$ARGS $1"
-;;
-esac
-shift
+  key="$1"
+
+  case $key in
+    --kokkos-path*)
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
+    --build-list*)
+      CUSTOM_BUILD_LIST="${key#*=}"
+      ;;
+    --debug*)
+      DEBUG=True
+      ;;
+    --build-only*)
+      BUILD_ONLY=True
+      ;;
+    --test-script*)
+      TEST_SCRIPT=True
+      ;;
+    --skip-hwloc*)
+      SKIP_HWLOC=True
+      ;;
+    --num*)
+      NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
+      ;;
+    --dry-run*)
+      DRYRUN=True
+      ;;
+    --spot-check*)
+      SPOT_CHECK=True
+      ;;
+    --arch*)
+      ARCH_FLAG="--arch=${key#*=}"
+      ;;
+    --opt-flag*)
+      OPT_FLAG="${key#*=}"
+      ;;
+    --with-cuda-options*)
+      KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
+      ;;
+    --help*)
+      PRINT_HELP=True
+      ;;
+    *)
+      # Args, just append.
+      ARGS="$ARGS $1"
+      ;;
+  esac
+
+  shift
 done
 
 SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
 
-# set kokkos path
+# Set kokkos path.
 if [ -z "$KOKKOS_PATH" ]; then
-    KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
+  KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
 else
-    # Ensure KOKKOS_PATH is abs path
-    KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+  # Ensure KOKKOS_PATH is abs path.
+  KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
 fi
 
 #
-# Machine specific config
+# Machine specific config.
 #
 
 if [ "$MACHINE" = "sems" ]; then
-    source /projects/sems/modulefiles/utils/sems-modules-init.sh
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
-    BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
-    CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
-    CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+  BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG=""
-    fi 
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG=""
+  fi
 
   if [ "$SPOT_CHECK" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
@@ -153,120 +158,118 @@ if [ "$MACHINE" = "sems" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   fi
-
 elif [ "$MACHINE" = "white" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
-    IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
-    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
+  CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
 
-    # Don't do pthread on white
-    GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+  # Don't do pthread on white.
+  GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
-               "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=Power8,Kepler37"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
+             "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=Power8,Kepler37"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "bowman" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
 
-    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+  OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-    )
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=KNL"
-    fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=KNL"
+  fi
 
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "sullivan" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=96
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=96
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS")
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS")
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=ARMv8-ThunderX"
-    fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=ARMv8-ThunderX"
+  fi
 
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "shepard" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
 
-    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+  OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-    )
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=HSW"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=HSW"
+  fi
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "apollo" ]; then
-    source /projects/sems/modulefiles/utils/sems-modules-init.sh
-    module use /home/projects/modulefiles/local/x86-64
-    module load kokkos-env
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
+  module use /home/projects/modulefiles/local/x86-64
+  module load kokkos-env
 
-    module load sems-git
-    module load sems-tex
-    module load sems-cmake/3.5.2
-    module load sems-gdb
+  module load sems-git
+  module load sems-tex
+  module load sems-cmake/3.5.2
+  module load sems-gdb
 
-    SKIP_HWLOC=True
+  SKIP_HWLOC=True
 
-    BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
-    CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
-    CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+  BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
 
-    CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
-    NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
+  NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
 
-    BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
-    BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
-    BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
+  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
+  BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
+  BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
 
   if [ "$SPOT_CHECK" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
@@ -297,16 +300,16 @@ elif [ "$MACHINE" = "apollo" ]; then
     )
   fi
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=SNB,Kepler35"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
-else
-    echo "Unhandled machine $MACHINE" >&2
-    exit 1
-fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=SNB,Kepler35"
+  fi
 
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
+else
+  echo "Unhandled machine $MACHINE" >&2
+  exit 1
+fi
 
 export OMP_NUM_THREADS=4
 
@@ -315,119 +318,149 @@ declare -i NUM_RESULTS_TO_KEEP=7
 RESULT_ROOT_PREFIX=TestAll
 
 if [ "$PRINT_HELP" = "True" ]; then
-echo "test_all_sandia <ARGS> <OPTIONS>:"
-echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
-echo "    Defaults to root repo containing this script"
-echo "--debug: Run tests in debug. Defaults to False"
-echo "--test-script: Test this script, not Kokkos"
-echo "--skip-hwloc: Do not do hwloc tests"
-echo "--num=N: Number of jobs to run in parallel"
-echo "--spot-check: Minimal test set to issue pull request"
-echo "--dry-run: Just print what would be executed"
-echo "--build-only: Just do builds, don't run anything"
-echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
-echo "--arch=ARCHITECTURE: overwrite architecture flags"
-echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
-echo "--build-list=BUILD,BUILD,BUILD..."
-echo "    Provide a comma-separated list of builds instead of running all builds"
-echo "    Valid items:"
-echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
-echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
-echo ""
-
-echo "ARGS: list of expressions matching compilers to test"
-echo "  supported compilers sems"
-for COMPILER_DATA in "${COMPILERS[@]}"; do
+  echo "test_all_sandia <ARGS> <OPTIONS>:"
+  echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+  echo "    Defaults to root repo containing this script"
+  echo "--debug: Run tests in debug. Defaults to False"
+  echo "--test-script: Test this script, not Kokkos"
+  echo "--skip-hwloc: Do not do hwloc tests"
+  echo "--num=N: Number of jobs to run in parallel"
+  echo "--spot-check: Minimal test set to issue pull request"
+  echo "--dry-run: Just print what would be executed"
+  echo "--build-only: Just do builds, don't run anything"
+  echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
+  echo "--arch=ARCHITECTURE: overwrite architecture flags"
+  echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
+  echo "--build-list=BUILD,BUILD,BUILD..."
+  echo "    Provide a comma-separated list of builds instead of running all builds"
+  echo "    Valid items:"
+  echo "      OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
+  echo "      Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
+  echo ""
+
+  echo "ARGS: list of expressions matching compilers to test"
+  echo "  supported compilers sems"
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
     ARR=($COMPILER_DATA)
     COMPILER=${ARR[0]}
     echo "    $COMPILER"
-done
-echo ""
-
-echo "Examples:"
-echo "  Run all tests"
-echo "  % test_all_sandia"
-echo ""
-echo "  Run all gcc tests"
-echo "  % test_all_sandia gcc"
-echo ""
-echo "  Run all gcc/4.7.2 and all intel tests"
-echo "  % test_all_sandia gcc/4.7.2 intel"
-echo ""
-echo "  Run all tests in debug"
-echo "  % test_all_sandia --debug"
-echo ""
-echo "  Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
-echo "  % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
-echo ""
-echo "If you want to kill the tests, do:"
-echo "  hit ctrl-z"
-echo "  % kill -9 %1"
-echo
-exit 0
+  done
+  echo ""
+
+  echo "Examples:"
+  echo "  Run all tests"
+  echo "  % test_all_sandia"
+  echo ""
+  echo "  Run all gcc tests"
+  echo "  % test_all_sandia gcc"
+  echo ""
+  echo "  Run all gcc/4.7.2 and all intel tests"
+  echo "  % test_all_sandia gcc/4.7.2 intel"
+  echo ""
+  echo "  Run all tests in debug"
+  echo "  % test_all_sandia --debug"
+  echo ""
+  echo "  Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
+  echo "  % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
+  echo ""
+  echo "If you want to kill the tests, do:"
+  echo "  hit ctrl-z"
+  echo "  % kill -9 %1"
+  echo
+  exit 0
 fi
 
-# set build type
+# Set build type.
 if [ "$DEBUG" = "True" ]; then
-    BUILD_TYPE=debug
+  BUILD_TYPE=debug
 else
-    BUILD_TYPE=release
+  BUILD_TYPE=release
 fi
 
-# If no args provided, do all compilers
+# If no args provided, do all compilers.
 if [ -z "$ARGS" ]; then
-    ARGS='?'
+  ARGS='?'
 fi
 
-# Process args to figure out which compilers to test
+# Process args to figure out which compilers to test.
 COMPILERS_TO_TEST=""
+
 for ARG in $ARGS; do
-    for COMPILER_DATA in "${COMPILERS[@]}"; do
-        ARR=($COMPILER_DATA)
-        COMPILER=${ARR[0]}
-        if [[ "$COMPILER" = $ARG* ]]; then
-            if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
-                COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
-            else
-                echo "Tried to add $COMPILER twice"
-            fi
-        fi
-    done
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+
+    if [[ "$COMPILER" = $ARG* ]]; then
+      if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
+        COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
+      else
+        echo "Tried to add $COMPILER twice"
+      fi
+    fi
+  done
 done
 
+# Check if Qthreads build requested.
+HAVE_QTHREADS_BUILD="False"
+if [ -n "$CUSTOM_BUILD_LIST" ]; then
+  if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
+    HAVE_QTHREADS_BUILD="True"
+  fi
+else
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    BUILD_LIST=${ARR[2]}
+    if [[ "$BUILD_LIST" = *Qthreads* ]]; then
+      HAVE_QTHREADS_BUILD="True"
+    fi
+  done
+fi
+
+# Ensure Qthreads path is set if Qthreads build is requested.
+if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
+  if [ -z "$QTHREADS_PATH" ]; then
+    echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
+    exit 1
+  else
+    # Strip trailing slashes from path.
+    QTHREADS_PATH=$(echo $QTHREADS_PATH | sed 's/\/*$//')
+  fi
+fi
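+# Example (illustrative paths): test the Qthreads backend with
+#   ./test_all_sandia gcc --build-list=Qthreads,Qthreads_Serial --qthreads-path=/path/to/qthreads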
+
 #
-# Functions
+# Functions.
 #
 
 # get_compiler_name <COMPILER>
 get_compiler_name() {
-    echo $1 | cut -d/ -f1
+  echo $1 | cut -d/ -f1
 }
 
 # get_compiler_version <COMPILER>
 get_compiler_version() {
-    echo $1 | cut -d/ -f2
+  echo $1 | cut -d/ -f2
 }
 
-# Do not call directly
+# Do not call directly.
 get_compiler_data() {
-    local compiler=$1
-    local item=$2
-    local compiler_name=$(get_compiler_name $compiler)
-    local compiler_vers=$(get_compiler_version $compiler)
-
-    local compiler_data
-    for compiler_data in "${COMPILERS[@]}" ; do
-        local arr=($compiler_data)
-        if [ "$compiler" = "${arr[0]}" ]; then
-            echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
-            return 0
-        fi
-    done
-
-    # Not found
-    echo "Unreconized compiler $compiler" >&2
-    exit 1
+  local compiler=$1
+  local item=$2
+  local compiler_name=$(get_compiler_name $compiler)
+  local compiler_vers=$(get_compiler_version $compiler)
+
+  local compiler_data
+  for compiler_data in "${COMPILERS[@]}" ; do
+    local arr=($compiler_data)
+
+    if [ "$compiler" = "${arr[0]}" ]; then
+      echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
+      return 0
+    fi
+  done
+
+  # Not found.
+  echo "Unreconized compiler $compiler" >&2
+  exit 1
 }
 
 #
@@ -435,227 +468,232 @@ get_compiler_data() {
 #
 
 get_compiler_modules() {
-    get_compiler_data $1 1
+  get_compiler_data $1 1
 }
 
 get_compiler_build_list() {
-    get_compiler_data $1 2
+  get_compiler_data $1 2
 }
 
 get_compiler_exe_name() {
-    get_compiler_data $1 3
+  get_compiler_data $1 3
 }
 
 get_compiler_warning_flags() {
-    get_compiler_data $1 4
+  get_compiler_data $1 4
 }
 
 run_cmd() {
-    echo "RUNNING: $*"
-    if [ "$DRYRUN" != "True" ]; then
-	eval "$* 2>&1"
-    fi
+  echo "RUNNING: $*"
+  if [ "$DRYRUN" != "True" ]; then
+    eval "$* 2>&1"
+  fi
 }
 
 # report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
 report_and_log_test_result() {
-    # Use sane var names
-    local success=$1; local desc=$2; local comment=$3;
+  # Use sane var names.
+  local success=$1; local desc=$2; local comment=$3;
 
-    if [ "$success" = "0" ]; then
-	echo "  PASSED $desc"
-        echo $comment > $PASSED_DIR/$desc
-    else
-        # For failures, comment should be the name of the phase that failed
-	echo "  FAILED $desc" >&2
-        echo $comment > $FAILED_DIR/$desc
-        cat ${desc}.${comment}.log
-    fi
+  if [ "$success" = "0" ]; then
+    echo "  PASSED $desc"
+    echo $comment > $PASSED_DIR/$desc
+  else
+    # For failures, comment should be the name of the phase that failed.
+    echo "  FAILED $desc" >&2
+    echo $comment > $FAILED_DIR/$desc
+    cat ${desc}.${comment}.log
+  fi
 }
 
 setup_env() {
-    local compiler=$1
-    local compiler_modules=$(get_compiler_modules $compiler)
-
-    module purge
-
-    local mod
-    for mod in $compiler_modules; do
-        echo "Loading module $mod"
-	module load $mod 2>&1
-        # It is ridiculously hard to check for the success of a loaded
-        # module. Module does not return error codes and piping to grep
-        # causes module to run in a subshell.
-        module list 2>&1 | grep "$mod" >& /dev/null || return 1
-    done
-
-    return 0
+  local compiler=$1
+  local compiler_modules=$(get_compiler_modules $compiler)
+
+  module purge
+
+  local mod
+  for mod in $compiler_modules; do
+    echo "Loading module $mod"
+    module load $mod 2>&1
+    # It is ridiculously hard to check for the success of a loaded
+    # module. Module does not return error codes and piping to grep
+    # causes module to run in a subshell.
+    module list 2>&1 | grep "$mod" >& /dev/null || return 1
+  done
+
+  return 0
 }
 
 # single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
 single_build_and_test() {
-    # Use sane var names
-    local compiler=$1; local build=$2; local build_type=$3;
+  # Use sane var names.
+  local compiler=$1; local build=$2; local build_type=$3;
+
+  # Set up env.
+  mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
+  cd $ROOT_DIR/$compiler/"${build}-$build_type"
+  local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
+  setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
 
-    # set up env
-    mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
-    cd $ROOT_DIR/$compiler/"${build}-$build_type"
-    local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
-    setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+  # Set up flags.
+  local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
+  local compiler_exe=$(get_compiler_exe_name $compiler)
 
-    # Set up flags
-    local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
-    local compiler_exe=$(get_compiler_exe_name $compiler)
+  if [[ "$build_type" = hwloc* ]]; then
+    local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
+  fi
 
+  if [[ "$build" = *Qthreads* ]]; then
     if [[ "$build_type" = hwloc* ]]; then
-        local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
+      local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
+    else
+      local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
     fi
+  fi
 
-    if [[ "$OPT_FLAG" = "" ]]; then
-      OPT_FLAG="-O3"
-    fi
+  if [[ "$OPT_FLAG" = "" ]]; then
+    OPT_FLAG="-O3"
+  fi
 
-    if [[ "$build_type" = *debug* ]]; then
-        local extra_args="$extra_args --debug"
-        local cxxflags="-g $compiler_warning_flags"
-    else
-        local cxxflags="$OPT_FLAG $compiler_warning_flags"
-    fi
+  if [[ "$build_type" = *debug* ]]; then
+    local extra_args="$extra_args --debug"
+    local cxxflags="-g $compiler_warning_flags"
+  else
+    local cxxflags="$OPT_FLAG $compiler_warning_flags"
+  fi
 
-    if [[ "$compiler" == cuda* ]]; then
-        cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
-        export TMPDIR=$(pwd)
-    fi
+  if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
+    local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
+  fi
 
-    if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
-        local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
-    fi
+  echo "  Starting job $desc"
 
-    echo "  Starting job $desc"
+  local comment="no_comment"
 
-    local comment="no_comment"
+  if [ "$TEST_SCRIPT" = "True" ]; then
+    local rand=$[ 1 + $[ RANDOM % 10 ]]
+    sleep $rand
 
-    if [ "$TEST_SCRIPT" = "True" ]; then
-        local rand=$[ 1 + $[ RANDOM % 10 ]]
-        sleep $rand
-        if [ $rand -gt 5 ]; then
-            run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
-        fi
-    else
-        run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
-        local -i build_start_time=$(date +%s)
-        run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
-        local -i build_end_time=$(date +%s)
-        comment="build_time=$(($build_end_time-$build_start_time))"
-        if [[ "$BUILD_ONLY" == False ]]; then
-            run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
-            local -i run_end_time=$(date +%s)
-            comment="$comment run_time=$(($run_end_time-$build_end_time))"
-        fi
+    if [ $rand -gt 5 ]; then
+      run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
     fi
+  else
+    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    local -i build_start_time=$(date +%s)
+    run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    local -i build_end_time=$(date +%s)
+    comment="build_time=$(($build_end_time-$build_start_time))"
+
+    if [[ "$BUILD_ONLY" == False ]]; then
+      run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
+      local -i run_end_time=$(date +%s)
+      comment="$comment run_time=$(($run_end_time-$build_end_time))"
+    fi
+  fi
 
-    report_and_log_test_result 0 $desc "$comment"
+  report_and_log_test_result 0 $desc "$comment"
 
-    return 0
+  return 0
 }
 
 # wait_for_jobs <NUM-JOBS>
 wait_for_jobs() {
-    local -i max_jobs=$1
-    local -i num_active_jobs=$(jobs | wc -l)
-    while [ $num_active_jobs -ge $max_jobs ]
-    do
-        sleep 1
-        num_active_jobs=$(jobs | wc -l)
-        jobs >& /dev/null
-    done
+  local -i max_jobs=$1
+  local -i num_active_jobs=$(jobs | wc -l)
+  while [ $num_active_jobs -ge $max_jobs ]
+  do
+    sleep 1
+    num_active_jobs=$(jobs | wc -l)
+    jobs >& /dev/null
+  done
 }
 
 # run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
 run_in_background() {
-    local compiler=$1
-
-    local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
-    # don't override command line input
-    # if [[ "$BUILD_ONLY" == True ]]; then
-        # num_jobs=8
-    # else
-        if [[ "$compiler" == cuda* ]]; then
-            num_jobs=1
-        fi
-    # fi
-    wait_for_jobs $num_jobs
-
-    single_build_and_test $* &
+  local compiler=$1
+
+  local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
+  # Don't override command line input.
+  # if [[ "$BUILD_ONLY" == True ]]; then
+  #   num_jobs=8
+  # else
+    if [[ "$compiler" == cuda* ]]; then
+      num_jobs=1
+    fi
+  # fi
+  wait_for_jobs $num_jobs
+
+  single_build_and_test $* &
 }
 
 # build_and_test_all <COMPILER>
 build_and_test_all() {
-    # Get compiler data
-    local compiler=$1
-    if [ -z "$CUSTOM_BUILD_LIST" ]; then
-	local compiler_build_list=$(get_compiler_build_list $compiler)
-    else
-	local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
-    fi
+  # Get compiler data.
+  local compiler=$1
+  if [ -z "$CUSTOM_BUILD_LIST" ]; then
+    local compiler_build_list=$(get_compiler_build_list $compiler)
+  else
+    local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
+  fi
 
-    # do builds
-    local build
-    for build in $compiler_build_list
-    do
-	run_in_background $compiler $build $BUILD_TYPE
+  # Do builds.
+  local build
+  for build in $compiler_build_list
+  do
+    run_in_background $compiler $build $BUILD_TYPE
 
-        # If not cuda, do a hwloc test too
-        if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
-            run_in_background $compiler $build "hwloc-$BUILD_TYPE"
-        fi
-    done
+    # If not cuda, do a hwloc test too.
+    if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
+      run_in_background $compiler $build "hwloc-$BUILD_TYPE"
+    fi
+  done
 
-    return 0
+  return 0
 }
 
 get_test_root_dir() {
-    local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
-    local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
-    local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
+  local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
+  local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
+  local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
 
-    if [ $num_to_delete -gt 0 ]; then
-        /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
-    fi
+  if [ $num_to_delete -gt 0 ]; then
+    /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
+  fi
 
-    echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
+  echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
 }
 
 wait_summarize_and_exit() {
-    wait_for_jobs 1
-
-    echo "#######################################################"
-    echo "PASSED TESTS"
-    echo "#######################################################"
-
-    local passed_test
-    for passed_test in $(\ls -1 $PASSED_DIR | sort)
-    do
-        echo $passed_test $(cat $PASSED_DIR/$passed_test)
-    done
-
-    echo "#######################################################"
-    echo "FAILED TESTS"
-    echo "#######################################################"
-
-    local failed_test
-    local -i rv=0
-    for failed_test in $(\ls -1 $FAILED_DIR | sort)
-    do
-        echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
-        rv=$rv+1
-    done
-
-    exit $rv
+  wait_for_jobs 1
+
+  echo "#######################################################"
+  echo "PASSED TESTS"
+  echo "#######################################################"
+
+  local passed_test
+  for passed_test in $(\ls -1 $PASSED_DIR | sort)
+  do
+    echo $passed_test $(cat $PASSED_DIR/$passed_test)
+  done
+
+  echo "#######################################################"
+  echo "FAILED TESTS"
+  echo "#######################################################"
+
+  local failed_test
+  local -i rv=0
+  for failed_test in $(\ls -1 $FAILED_DIR | sort)
+  do
+    echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
+    rv=$rv+1
+  done
+
+  exit $rv
 }
 
 #
-# Main
+# Main.
 #
 
 ROOT_DIR=$(get_test_root_dir)
@@ -669,8 +707,8 @@ mkdir -p $FAILED_DIR
 
 echo "Going to test compilers: " $COMPILERS_TO_TEST
 for COMPILER in $COMPILERS_TO_TEST; do
-    echo "Testing compiler $COMPILER"
-    build_and_test_all $COMPILER
+  echo "Testing compiler $COMPILER"
+  build_and_test_all $COMPILER
 done
 
 wait_summarize_and_exit
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index 3277c007d0845485a57ed7aabfa35202f1b22d1b..53e0eab693afeca7bbe0c164666612dc5ccc36d9 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -60,7 +60,7 @@ class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
 { 
 public:
 
-  typedef ViewTraits< DataType , P ... >  traits ;
+  typedef Kokkos::ViewTraits< DataType , P ... >  traits ;
 
 private:
 
@@ -123,30 +123,41 @@ public:
 
   enum { Rank = 1 };
 
-  KOKKOS_INLINE_FUNCTION constexpr size_t size() const
+  KOKKOS_INLINE_FUNCTION
+  size_t size() const noexcept
     {
-      return
-        Kokkos::Impl::MemorySpaceAccess
-          < Kokkos::Impl::ActiveExecutionMemorySpace
-          , typename traits::memory_space
-          >::accessible 
-        ? // Runtime size is at the end of the chunk pointer array
-          (*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
-          << m_chunk_shift
-        : 0 ;
+      uintptr_t n = 0 ;
+
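+      // The runtime size (in chunks) is stored at the end of the chunk
+      // pointer array; read it directly when this execution space can access
+      // that memory, otherwise copy it back to the host first.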
+      if ( Kokkos::Impl::MemorySpaceAccess
+            < Kokkos::Impl::ActiveExecutionMemorySpace
+            , typename traits::memory_space
+            >::accessible ) {
+        n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
+      }
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      else {
+        Kokkos::Impl::DeepCopy< Kokkos::HostSpace
+                              , typename traits::memory_space
+                              , Kokkos::HostSpace::execution_space >
+          ( & n
+          , reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max )
+          , sizeof(uintptr_t) );
+      }
+#endif
+      return n << m_chunk_shift ;
     }
 
   template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
+  KOKKOS_INLINE_FUNCTION
   size_t extent( const iType & r ) const
     { return r == 0 ? size() : 1 ; }
 
   template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
+  KOKKOS_INLINE_FUNCTION
   size_t extent_int( const iType & r ) const
     { return r == 0 ? size() : 1 ; }
 
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
+  KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
@@ -270,10 +281,18 @@ public:
     }
 
   /** \brief  Resizing in serial can grow or shrink the array size, */
+  template< typename IntType >
   inline
-  void resize_serial( size_t n )
+  typename std::enable_if
+    < std::is_integral<IntType>::value &&
+      Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
+                                     , typename traits::memory_space
+                                     >::accessible
+    >::type
+  resize_serial( IntType const & n )
     {
-      DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
+      typedef typename traits::value_type value_type ;
+      typedef value_type * pointer_type ;
 
       const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
 
@@ -286,8 +305,8 @@ public:
 
       if ( *pc < NC ) {
         while ( *pc < NC ) {
-          m_chunks[*pc] =
-            m_pool.allocate( sizeof(traits::value_type) << m_chunk_shift );
+          m_chunks[*pc] = reinterpret_cast<pointer_type>
+            ( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
           ++*pc ;
         }
       }
@@ -295,12 +314,90 @@ public:
         while ( NC + 1 <= *pc ) {
           --*pc ;        
           m_pool.deallocate( m_chunks[*pc]
-                           , sizeof(traits::value_type) << m_chunk_shift );
+                           , sizeof(value_type) << m_chunk_shift );
           m_chunks[*pc] = 0 ;
         }
       }
     }
 
+  //----------------------------------------
+
+  struct ResizeSerial {
+    memory_pool                    m_pool ;
+    typename traits::value_type ** m_chunks ;
+    uintptr_t                    * m_pc ;
+    uintptr_t                      m_nc ;
+    unsigned                       m_chunk_shift ;  
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( int ) const
+      {
+        typedef typename traits::value_type value_type ;
+        typedef value_type * pointer_type ;
+
+        if ( *m_pc < m_nc ) {
+          while ( *m_pc < m_nc ) {
+            m_chunks[*m_pc] = reinterpret_cast<pointer_type>
+              ( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
+            ++*m_pc ;
+          }
+        }
+        else {
+          while ( m_nc + 1 <= *m_pc ) {
+            --*m_pc ;        
+            m_pool.deallocate( m_chunks[*m_pc]
+                             , sizeof(value_type) << m_chunk_shift );
+            m_chunks[*m_pc] = 0 ;
+          }
+        }
+      }
+
+    ResizeSerial( memory_pool            const & arg_pool
+                , typename traits::value_type ** arg_chunks
+                , uintptr_t                    * arg_pc
+                , uintptr_t                      arg_nc
+                , unsigned                       arg_chunk_shift
+                )
+      : m_pool( arg_pool )
+      , m_chunks( arg_chunks )
+      , m_pc( arg_pc )
+      , m_nc( arg_nc )
+      , m_chunk_shift( arg_chunk_shift )
+      {}
+  };
+
+  template< typename IntType >
+  inline
+  typename std::enable_if
+    < std::is_integral<IntType>::value &&
+      ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
+                                       , typename traits::memory_space
+                                       >::accessible
+    >::type
+  resize_serial( IntType const & n )
+    {
+      const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
+
+      if ( m_chunk_max < NC ) {
+        Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
+      }
+
+      // Must dispatch kernel
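+      // The chunk count lives in the view's (non-host-accessible) memory space,
+      // so ResizeSerial runs as a one-iteration kernel in that execution space,
+      // followed by a fence.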
+
+      typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
+
+      uintptr_t * const pc =
+        reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
+
+      Kokkos::Impl::ParallelFor<ResizeSerial,Range>
+        closure( ResizeSerial( m_pool, m_chunks, pc, NC, m_chunk_shift )
+               , Range(0,1) );
+
+      closure.execute();
+
+      traits::execution_space::fence();
+    }
+
   //----------------------------------------------------------------------
 
   ~DynamicView() = default ;
@@ -311,15 +408,17 @@ public:
   DynamicView & operator = ( const DynamicView & ) = default ;
 
   template< class RT , class ... RP >
-  KOKKOS_INLINE_FUNCTION
   DynamicView( const DynamicView<RT,RP...> & rhs )
     : m_pool( rhs.m_pool )
     , m_track( rhs.m_track )
-    , m_chunks( rhs.m_chunks )
+    , m_chunks( (typename traits::value_type **) rhs.m_chunks )
     , m_chunk_shift( rhs.m_chunk_shift )
     , m_chunk_mask( rhs.m_chunk_mask )
     , m_chunk_max( rhs.m_chunk_max )
     {
+      typedef typename DynamicView<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
     }
 
   //----------------------------------------------------------------------
@@ -400,8 +499,6 @@ public:
     , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
     , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
     {
-      DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
-
       // A functor to deallocate all of the chunks upon final destruction
 
       typedef typename traits::memory_space  memory_space ;
diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index 8646d277921aff5c71b70c48d768ee39944b3455..193f1bc334dd76177e3823f6decee9dbd71b137e 100644
--- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -230,16 +230,17 @@ public:
   typedef typename Impl::remove_const<declared_value_type>::type value_type;
   typedef typename Impl::add_const<value_type>::type const_value_type;
 
-  typedef Device execution_space;
+  typedef Device device_type;
+  typedef typename Device::execution_space execution_space;
   typedef Hasher hasher_type;
   typedef EqualTo  equal_to_type;
   typedef uint32_t size_type;
 
   //map_types
-  typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
-  typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type>                   insertable_map_type;
-  typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type>             modifiable_map_type;
-  typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type>       const_map_type;
+  typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
+  typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type>                   insertable_map_type;
+  typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type>             modifiable_map_type;
+  typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type>       const_map_type;
 
   static const bool is_set = std::is_same<void,value_type>::value;
   static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
@@ -264,18 +265,18 @@ private:
   typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
 
   typedef typename Impl::if_c<   is_insertable_map
-                               , View< key_type *, execution_space>
-                               , View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< key_type *, device_type>
+                               , View< const key_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type key_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map || is_modifiable_map
-                               , View< impl_value_type *, execution_space>
-                               , View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< impl_value_type *, device_type>
+                               , View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type value_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map
-                               , View< size_type *, execution_space>
-                               , View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< size_type *, device_type>
+                               , View< const size_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type size_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map
@@ -285,7 +286,7 @@ private:
 
   enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
   enum { num_scalars = 3 };
-  typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
+  typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
 
 public:
   //! \name Public member functions
@@ -757,7 +758,7 @@ public:
 
       Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
 
-      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
+      typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
 
       raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
       raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
@@ -781,21 +782,21 @@ private: // private member functions
 
   void set_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
     const int true_ = true;
     raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
   }
 
   void reset_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
     const int false_ = false;
     raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
   }
 
   bool get_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
     int result = false;
     raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
     return result;
diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt
index b9d860f32fd854a59e0258adabdc540a1ef0c512..0c59c616d620598b835525eb70410d0a26f6af6b 100644
--- a/lib/kokkos/containers/unit_tests/CMakeLists.txt
+++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt
@@ -3,38 +3,49 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
 
-SET(SOURCES
-  UnitTestMain.cpp 
-  TestCuda.cpp
-  )
-
 SET(LIBRARIES kokkoscore)
 
 IF(Kokkos_ENABLE_Pthread)
-  LIST( APPEND SOURCES
-    TestThreads.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Threads
+  SOURCES TestThreads.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
 IF(Kokkos_ENABLE_Serial)
-  LIST( APPEND SOURCES
-    TestSerial.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Serial
+  SOURCES TestSerial.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
 IF(Kokkos_ENABLE_OpenMP)
-  LIST( APPEND SOURCES
-    TestOpenMP.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_OpenMP
+  SOURCES TestOpenMP.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
-
+IF(Kokkos_ENABLE_Cuda)
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest
-  SOURCES ${SOURCES}
+  UnitTest_Cuda
+  SOURCES TestCuda.cpp UnitTestMain.cpp
   COMM serial mpi
   NUM_MPI_PROCS 1
   FAIL_REGULAR_EXPRESSION "  FAILED  "
   TESTONLYLIBS kokkos_gtest
   )
-  
+ENDIF()
+
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
index 7e3ca005f4b6401a088208fca120c097143afc49..beb07bd791cf162c31706b1eeaf31a4c25c91ba5 100644
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -64,6 +64,7 @@ struct TestDynamicView
   typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
 
   typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
+  typedef typename view_type::const_type const_view_type ;
 
   typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
   typedef double value_type;
@@ -136,6 +137,8 @@ struct TestDynamicView
 
     view_type da("A",pool,arg_total_size);
 
+    const_view_type ca(da);
+
 // printf("TestDynamicView::run(%d) construct test functor\n",arg_total_size);
 
     TestDynamicView functor(da,arg_total_size);
diff --git a/lib/kokkos/core/cmake/Dependencies.cmake b/lib/kokkos/core/cmake/Dependencies.cmake
index ae9a20c50efeadec69ab22e3365cd3ec26a5e451..8d9872725e59655f256a9e62bf3f706a79e80e59 100644
--- a/lib/kokkos/core/cmake/Dependencies.cmake
+++ b/lib/kokkos/core/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
   TEST_OPTIONAL_TPLS CUSPARSE
   )
 
-TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
\ No newline at end of file
+TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
index 9359b5a32b71f06230ea8a2e878e0f457f8eee85..a71e60f20742edd8417365bb99c45f172dc5b218 100644
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@@ -30,7 +30,7 @@
 
 #cmakedefine KOKKOS_HAVE_PTHREAD
 #cmakedefine KOKKOS_HAVE_SERIAL
-#cmakedefine KOKKOS_HAVE_QTHREAD
+#cmakedefine KOKKOS_HAVE_QTHREADS
 #cmakedefine KOKKOS_HAVE_Winthread
 #cmakedefine KOKKOS_HAVE_OPENMP
 #cmakedefine KOKKOS_HAVE_HWLOC
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
index 85f869971a33c349769bd318af28759f3e3eca12..3a0ad2d4c16a4e16d73e91eec131ee092bf9f47e 100644
--- a/lib/kokkos/core/perf_test/Makefile
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -60,4 +60,3 @@ clean: kokkos-clean
 
 gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
-
diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
index 7386ecef2032f32da8d4e672999e09021b5a673c..65ce61fb53b9e5d8025f1f6f59e8ecf194ec45f0 100644
--- a/lib/kokkos/core/perf_test/PerfTestCuda.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
@@ -52,6 +52,8 @@
 
 #include <impl/Kokkos_Timer.hpp>
 
+#include <PerfTestMDRange.hpp>
+
 #include <PerfTestHexGrad.hpp>
 #include <PerfTestBlasKernels.hpp>
 #include <PerfTestGramSchmidt.hpp>
@@ -72,6 +74,14 @@ class cuda : public ::testing::Test {
     }
 };
 
+//TEST_F( cuda, mdrange_lr ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
+//}
+
+//TEST_F( cuda, mdrange_ll ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
+//}
+
 TEST_F( cuda, hexgrad )
 {
   EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
index 7b6cfc5b5ce96399dcff47e1976b630088650af2..4732c3275a7f92cf1b1fc8f4d457c059ceb0679e 100644
--- a/lib/kokkos/core/perf_test/PerfTestDriver.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
@@ -60,6 +60,342 @@ namespace Test {
 
 enum { NUMBER_OF_TRIALS = 5 };
 
+template< class DeviceType , class LayoutType >
+void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0,  int tile_offset = 0 )
+// exp_beg = 6 => 2^6 = 64 is starting range length
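+// e.g. the (currently commented-out) callers use run_test_mdrange< Device, Layout >( 5, 8, ... ),
+// which sweeps range lengths 32, 64, and 128 per dimension.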
+{
+#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
+
+  std::string label_mdrange ;
+  label_mdrange.append( "\"MDRange< double , " );
+  label_mdrange.append( deviceTypeName );
+  label_mdrange.append( " >\"" );
+
+  std::string label_range_col2 ;
+  label_range_col2.append( "\"RangeColTwo< double , " );
+  label_range_col2.append( deviceTypeName );
+  label_range_col2.append( " >\"" );
+
+  std::string label_range_col_all ;
+  label_range_col_all.append( "\"RangeColAll< double , " );
+  label_range_col_all.append( deviceTypeName );
+  label_range_col_all.append( " >\"" );
+
+  if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Right"
+      << "\n--------------------------------------------------------------" << std::endl;
+  } else {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Left"
+      << "\n--------------------------------------------------------------" << std::endl;
+  }
+
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    const int range_length = (1<<i) + range_offset;
+
+    std::cout << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n"
+      << "MDRange Test:  range bounds: " << range_length << " , " << range_length << " , " << range_length 
+      << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n";
+//      << std::endl;
+
+    int t0_min = 0, t1_min = 0, t2_min = 0;
+    double seconds_min = 0.0;
+
+    // Test 1: The MDRange in full
+    {
+    int t0 = 1, t1 = 1, t2 = 1;
+    int counter = 1;
+#if !defined(KOKKOS_HAVE_CUDA)
+    int min_bnd = 8;
+    int tfast = range_length;
+#else
+    int min_bnd = 2;
+    int tfast = 32;
+#endif
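+    // Sweep tile shapes (t0,t1,t2) = (min_bnd, tmid, tfast) in powers of two,
+    // also timing the reversed ordering, and keep the fastest configuration.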
+    while ( tfast >= min_bnd ) {
+      int tmid = min_bnd;
+      while ( tmid < tfast ) { 
+        t0 = min_bnd;
+        t1 = tmid;
+        t2 = tfast;
+        int t2_rev = min_bnd;
+        int t1_rev = tmid;
+        int t0_rev = tfast;
+
+#if defined(KOKKOS_HAVE_CUDA)
+        //Note: Product of tile sizes must be < 1024 for Cuda
+        if ( t0*t1*t2 >= 1024 ) {
+          printf("  Exceeded Cuda tile limits; onto next range set\n\n");
+          break;
+        }
+#endif
+
+        // Run 1 with tiles LayoutRight style
+        double seconds_1 = 0;
+        { seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0 << " , " << t1 << " , " << t2
+          << " , " << seconds_1
+          << std::endl ;
+#endif
+
+        if ( counter == 1 ) {
+          seconds_min = seconds_1;
+          t0_min = t0;
+          t1_min = t1;
+          t2_min = t2;
+        } 
+        else {
+          if ( seconds_1 < seconds_min ) 
+          { 
+            seconds_min = seconds_1; 
+            t0_min = t0;
+            t1_min = t1;
+            t2_min = t2;
+          }
+        }
+
+        // Run 2 with tiles LayoutLeft style - reverse order of tile dims
+        double seconds_1rev = 0;
+        { seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
+          << " , " << seconds_1rev
+          << std::endl ;
+#endif
+
+        if ( seconds_1rev < seconds_min ) 
+        { 
+          seconds_min = seconds_1rev; 
+          t0_min = t0_rev;
+          t1_min = t1_rev;
+          t2_min = t2_rev;
+        }
+
+        ++counter;
+        tmid <<= 1;
+      } //end inner while
+      tfast >>=1;
+    } //end outer while
+
+    std::cout << "\n"
+      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0_min << " , " << t1_min << " , " << t2_min
+      << "\n Min time: " << seconds_min
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    } //end scope
+
+#if !defined(KOKKOS_HAVE_CUDA)
+  double seconds_min_c = 0.0;
+  int t0c_min = 0, t1c_min = 0, t2c_min = 0;
+  int counter = 1;
+  {
+    int min_bnd = 8;
+    // Test 1_c: MDRange with 0 for 'inner' tile dim; this case will utilize the full span in that direction, should be similar to Collapse<2>
+    if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
+      for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
+        for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LR with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " << T0 << " , " << T1 << " , " << range_length
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+          t2c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t0c_min = T0;
+            t1c_min = T1;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t0c_min = T0;
+              t1c_min = T1;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+    else {
+      for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
+        for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LL with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " <<range_length << " < " << T1 << " , " << T2
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+
+          t0c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t1c_min = T1;
+            t2c_min = T2;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t1c_min = T1;
+              t2c_min = T2;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+
+    std::cout 
+//      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "  Collapse<2> style: "
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0c_min << " , " << t1c_min << " , " << t2c_min
+      << "\n Min time: " << seconds_min_c
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+  } //end scope test 2
+#endif
+
+
+    // Test 2: RangePolicy Collapse2 style
+    double seconds_2 = 0;
+    { seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
+    std::cout << label_range_col2
+      << " , " << range_length
+      << " , " << seconds_2
+      << std::endl ;
+
+
+    // Test 3: RangePolicy Collapse all style - not necessary, always slow
+    /*
+    double seconds_3 = 0;
+    { seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
+    std::cout << label_range_col_all
+      << " , " << range_length
+      << " , " << seconds_3
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    */
+
+    // Compare fastest times... will never be collapse all so ignore it
+    // seconds_min = tiled MDRange
+    // seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
+    // seconds_2 = collapse<2>-style RangePolicy
+    // seconds_3 = collapse<3>-style RangePolicy
+
+#if !defined(KOKKOS_HAVE_CUDA)
+    if ( seconds_min < seconds_min_c ) {
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    }
+    else if ( seconds_min > seconds_min_c ) {
+      if ( seconds_min_c < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
+          << " Time: " << seconds_min_c
+          << " Difference: " << seconds_2 - seconds_min_c
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min_c > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min_c - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    } // end else if
+#else
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+#endif
+
+  } //end for
+
+#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+
+}
 
 
 template< class DeviceType >
diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp
index 606177ca50effc8a6cf88ced253ce2e1ea9930a2..831d581109984319a4c8a61674a42a297ace443a 100644
--- a/lib/kokkos/core/perf_test/PerfTestHost.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp
@@ -66,6 +66,8 @@ const char TestHostDeviceName[] = "Kokkos::Serial" ;
 
 #include <impl/Kokkos_Timer.hpp>
 
+#include <PerfTestMDRange.hpp>
+
 #include <PerfTestHexGrad.hpp>
 #include <PerfTestBlasKernels.hpp>
 #include <PerfTestGramSchmidt.hpp>
@@ -102,6 +104,14 @@ protected:
   }
 };
 
+//TEST_F( host, mdrange_lr ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
+//}
+
+//TEST_F( host, mdrange_ll ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
+//}
+
 TEST_F( host, hexgrad ) {
   EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
 }
diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange.hpp b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d910b513c67f94eec4c1254fd4528ec4d74c62a5
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
@@ -0,0 +1,564 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Test {
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct MultiDimRangePerf3D
+{
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long i, const long j, const long k) const
+  {
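+    // 7-point forward stencil: 0.25 times the sum of B at (i,j,k) and its +1/+2
+    // neighbors in each direction; B is padded by 2 per dimension to stay in bounds.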
+    A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                             + B(i,j+2,k) + B(i,j+1,k)
+                             + B(i,j,k+2) + B(i,j,k+1)
+                             + B(i,j,k) );
+  }
+
+
+  struct InitZeroTag {};
+//  struct InitViewTag {};
+
+  struct Init
+  {
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 1.0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const InitZeroTag&, const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 0;
+    }
+
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+  };
+
+
+  static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
+  {
+    //This test performs multidim range over all dims
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
+
+    double dt_min = 0;
+
+    // LayoutRight
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
+      using tile_type = typename MDRangeType::tile_type;
+      using point_type = typename MDRangeType::point_type;
+
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+      }
+    } //end for
+
+    } 
+    // LayoutLeft
+    else {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      //typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
+      //using tile_type = typename MDRangeType::tile_type;
+      //using point_type = typename MDRangeType::point_type;
+      //Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} ); 
+
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+
+      }
+    } //end for
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseTwo
+{
+  // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for multi-dim; unroll 2 dims in one-dim
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
+//r = j + i*Nj
+      long i = int(r / jrange); 
+      long j = int( r - i*jrange);
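+      // Example: with jrange = 4 and r = 10 this recovers i = 2, j = 2; the loop
+      // below then walks k, the contiguous LayoutRight dimension.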
+      for (int k = 0; k < krange; ++k) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
+//r = j + k*Nj
+      long k = int(r / jrange); 
+      long j = int( r - k*jrange);
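+      // Example: with jrange = 4 and r = 10 this recovers k = 2, j = 2; the loop
+      // below then walks i, the contiguous LayoutLeft dimension.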
+      for (int i = 0; i < irange; ++i) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+  }
+
+
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / jrange); 
+        long j = int( r - i*jrange);
+        for (int k = 0; k < krange; ++k) {
+          input(i,j,k) = 1;
+        }
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / jrange); 
+        long j = int( r - k*jrange);
+        for (int i = 0; i < irange; ++i) {
+          input(i,j,k) = 1;
+        }
+      }
+    }
+  };
+
+
+  static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    // This test refers to collapsing two dims while using the RangePolicy
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
+
+    long collapse_index_rangeA = 0;
+    long collapse_index_rangeB = 0;
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      collapse_index_rangeA = icount*jcount;
+      collapse_index_rangeB = (icount+2)*(jcount+2);
+//      std::cout << "   LayoutRight " << std::endl;
+    } else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
+      collapse_index_rangeA = kcount*jcount;
+      collapse_index_rangeB = (kcount+2)*(jcount+2);
+//      std::cout << "   LayoutLeft " << std::endl;
+    } else {
+      std::cout << "  LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
+      exit(-1);
+    }
+
+    Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
+
+    double dt_min = 0;
+
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " RP collapse2: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseAll
+{
+  // RangePolicy for 3D range, but will collapse all dims
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+      long i = int(r / (jrange*krange)); 
+      long j = int(( r - i*jrange*krange)/krange);
+      long k = int(r - i*jrange*krange - j*krange);
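+      // Example: with jrange = krange = 4, r = 37 recovers i = 2, j = 1, k = 1.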
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+      long k = int(r / (irange*jrange)); 
+      long j = int(( r - k*irange*jrange)/irange);
+      long i = int(r - k*irange*jrange - j*irange);
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+  }
+
+
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / (jrange*krange)); 
+        long j = int(( r - i*jrange*krange)/krange);
+        long k = int(r - i*jrange*krange - j*krange);
+        input(i,j,k) = 1;
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / (irange*jrange));
+        long j = int(( r - k*irange*jrange)/irange);
+        long i = int(r - k*irange*jrange - j*irange);
+        input(i,j,k) = 1;
+      }
+    }
+  };
+
+
+  static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    //This test refers to collapsing all dims using the RangePolicy
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
+
+    const long flat_index_range = icount*jcount*kcount;
+    Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (icount+2)*(jcount+2)*(kcount+2) );
+
+    double dt_min = 0;
+
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // Also, compare against epsilon, as vectorization can change bitwise answer
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Callapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " RP collapse all: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+} //end namespace Test
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
index 807a01ed01b128c531b87df0c27e1d406525b603..492470d05d07ee5684a04bff54fc103e82708ba9 100644
--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -92,13 +92,13 @@ LIST(APPEND SOURCES         ${SOURCES_CUDA} )
 INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
 
 #-----------------------------------------------------------------------------
-FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
-FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
+FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
+FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
 
-LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
-LIST(APPEND SOURCES         ${SOURCES_QTHREAD} )
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
+LIST(APPEND SOURCES         ${SOURCES_QTHREADS} )
 
-INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
+INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
 
 #-----------------------------------------------------------------------------
 
@@ -109,5 +109,3 @@ TRIBITS_ADD_LIBRARY(
     SOURCES ${SOURCES}
     DEPLIBS
     )
-
-
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0eadb25a005f09e1c9d37400bd76a611cc4eb3b
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
@@ -0,0 +1,1300 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+
+//#include<Cuda/Kokkos_CudaExec.hpp>
+// Including the file above leads to errors of the following type:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// As a result, cuda_parallel_launch and the associated code are recreated here.
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+// ------------------------------------------------------------------ //
+
+template< class DriverType >
+__global__
+static void cuda_parallel_launch( const DriverType driver )
+{
+  driver();
+}
+
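+// Minimal launch helper: the driver functor is passed by value to the kernel
+// above and invoked by every thread of the launched grid.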
+template< class DriverType >
+struct CudaLaunch
+{
+  inline
+  CudaLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+            )
+  {
+    cuda_parallel_launch< DriverType ><<< grid , block >>>(driver);
+  }
+
+};
+
+// ------------------------------------------------------------------ //
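+// apply_impl is specialized for each rank (2 through 6), with a void-tag and a
+// tagged variant per rank. Each specialization maps MDRange tiles onto the CUDA
+// grid and the indices within a tile onto the thread block.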
+template< int N , typename RP , typename Functor , typename Tag >
+struct apply_impl;
+
+//Rank 2
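+// Rank-2 mapping: blockIdx.x/threadIdx.x cover dimension 0 and blockIdx.y/threadIdx.y
+// cover dimension 1; the tile-id loops stride by gridDim so all tiles are covered.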
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+// LL: inner_direction == Left, dimension 0 is traversed in the innermost loop
+  if (RP::inner_direction == RP::Left) {
+ /*
+    index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
+    index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
+
+    for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
+    for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
+            m_func(i, j);
+    } }
+*/
+    for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+      const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+      if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+        for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+          const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+          if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+            m_func(offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+// LR: inner_direction == Right, the highest dimension is traversed in the innermost loop
+  else {
+/*
+    index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
+    index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
+
+    for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
+    for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
+            m_func(i, j);
+    } }
+*/
+    for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+      const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+      if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+        for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+          if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+            m_func(offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+  if (RP::inner_direction == RP::Left) {
+    // Stride over tiles in steps of gridDim until the full iteration range is covered
+/*
+    index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
+    index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
+
+    for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
+    for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
+            m_func(Tag(), i, j);
+    } }
+*/
+    for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+      const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+      if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+        for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+          const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+          if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+            m_func(Tag(), offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+  else {
+/*
+    index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
+    index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
+
+    for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
+    for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
+            m_func(Tag(), i, j);
+    } }
+*/
+    for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+      const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+      if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+        for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+          if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+            m_func(Tag(), offset_0 , offset_1);
+          }
+        }
+      }
+    }
+  }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 3
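+// Rank-3 mapping: dimensions 0, 1, and 2 map to the x, y, and z block/thread
+// indices; the tile-id loops stride by gridDim to cover all tiles.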
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+  else {
+    for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+      const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+      if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+        for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+          const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+          if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+            for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+              const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+              if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                m_func(offset_0 , offset_1 , offset_2);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+        if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+                if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
+        if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
+            if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), offset_0 , offset_1 , offset_2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 4
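+// Rank-4 mapping: dimensions 0 and 1 are flattened into the x block/thread
+// indices, dimension 2 uses y, and dimension 3 uses z.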
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
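+      // Dimensions 0 and 1 share blockIdx.x and threadIdx.x: the per-dimension block
+      // counts are clamped so their product stays within max_blocks, and the tile and
+      // thread ids are then decoded with % and /.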
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+
+          for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+
+                  for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 5
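+// Rank-5 mapping: dimensions 0-1 are flattened into the x block/thread indices,
+// dimensions 2-3 into y, and dimension 4 uses z.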
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+//Rank 6
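+// Rank-6 mapping: dimensions 0-1 are flattened into the x block/thread indices,
+// dimensions 2-3 into y, and dimensions 4-5 into z.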
+// Specializations for void tag type
+template< typename RP , typename Functor >
+struct apply_impl<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
+
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// ----------------------------------------------------------------------------------
+
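+// DeviceIterateTile computes the launch configuration on the host (execute()) and
+// dispatches to the rank- and tag-specific apply_impl on the device (operator()).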
+template < typename RP
+         , typename Functor
+         , typename Tag
+         >
+struct DeviceIterateTile
+{
+  using index_type = typename RP::index_type;
+  using array_index_type = typename RP::array_index_type;
+  using point_type = typename RP::point_type;
+
+  struct VoidDummy {};
+  typedef typename std::conditional< std::is_same<Tag, void>::value, VoidDummy, Tag>::type usable_tag;
+
+  DeviceIterateTile( const RP & rp, const Functor & func )
+    : m_rp{rp}
+    , m_func{func}
+  {}
+
+private:
+  inline __device__
+  void apply() const
+  {
+    apply_impl<RP::rank,RP,Functor,Tag>(m_rp,m_func).exec_range();
+  } //end apply
+
+public:
+
+  inline
+  __device__
+  void operator()(void) const
+  {
+    this->apply();
+  }
+
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = 65535; // grid-dimension limit for y and z; newer architectures allow a larger gridDim.x
+    if ( RP::rank == 2 )
+    {
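+      // One thread per point of a tile; the grid covers ceil(extent / tile) tiles
+      // per dimension, capped at maxblocks.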
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        ,  std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+protected:
+  const RP         m_rp;
+  const Functor    m_func;
+};
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+#endif // defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+#endif // KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index 0a0f41686bab1232f0bebe9e66dc4f6b08c76d6b..a273db998ba808726f4d9b5bc17bfc10347952ed 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -131,6 +131,7 @@ namespace Impl {
     int* atomic;
     int* scratch;
     int* threadid;
+    int n; // set to Kokkos::Cuda::concurrency() when the lock arrays are initialized
   };
 }
 }
@@ -250,6 +251,7 @@ struct CudaParallelLaunch< DriverType , true > {
       locks.atomic = atomic_lock_array_cuda_space_ptr(false);
       locks.scratch = scratch_lock_array_cuda_space_ptr(false);
       locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      locks.n = Kokkos::Cuda::concurrency();
       cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
@@ -292,6 +294,7 @@ struct CudaParallelLaunch< DriverType , false > {
       locks.atomic = atomic_lock_array_cuda_space_ptr(false);
       locks.scratch = scratch_lock_array_cuda_space_ptr(false);
       locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      locks.n = Kokkos::Cuda::concurrency();
       cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 91a3c921381709fc0ade5776b03ef48a2abcfe67..303b3fa4f699f0e56c7d44682197bd050b2ac7ca 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -59,7 +59,7 @@
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <impl/Kokkos_Error.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 
@@ -184,7 +184,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
 
   enum { max_uvm_allocations = 65536 };
 
-  if ( arg_alloc_size > 0 ) 
+  if ( arg_alloc_size > 0 )
   {
     Kokkos::Impl::num_uvm_allocations++;
 
@@ -193,7 +193,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
     }
 
     CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
-  } 
+  }
 
   return ptr ;
 }
@@ -375,7 +375,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::CudaSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
 
     SharedAllocationHeader header ;
@@ -395,7 +395,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
 SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::fence(); //Make sure I can access the label ...
     Kokkos::Profiling::deallocateData(
@@ -412,7 +412,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
 SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -442,7 +442,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
   , m_tex_obj( 0 )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -479,7 +479,7 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
   , m_tex_obj( 0 )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -510,7 +510,7 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
       )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -745,14 +745,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
       //Formatting dependent on sizeof(uintptr_t)
       const char * format_string;
 
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
         format_string = "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
       }
-      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
         format_string = "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
       }
 
-      snprintf( buffer , 256 
+      snprintf( buffer , 256
               , format_string
               , reinterpret_cast<uintptr_t>( r )
               , reinterpret_cast<uintptr_t>( r->m_prev )
@@ -776,14 +776,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
         //Formatting dependent on sizeof(uintptr_t)
         const char * format_string;
 
-        if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
           format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
         }
-        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
           format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
         }
 
-        snprintf( buffer , 256 
+        snprintf( buffer , 256
                 , format_string
                 , reinterpret_cast< uintptr_t >( r->data() )
                 , r->size()
@@ -883,6 +883,7 @@ void init_lock_arrays_cuda_space() {
     locks.atomic = atomic_lock_array_cuda_space_ptr(false);
     locks.scratch = scratch_lock_array_cuda_space_ptr(false);
     locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+    locks.n = Kokkos::Cuda::concurrency();
     cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
     init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
     init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
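The hunk above records the lock-array length (Kokkos::Cuda::concurrency()) in the struct before copying it to the device-constant symbol and launching the init kernels with ceil(n/256) blocks of 256 threads. Below is a minimal sketch of that host-to-symbol pattern, assuming a device lock buffer has already been allocated; the names LockArrays, g_locks and setup_locks are hypothetical and not part of the patch:

struct LockArrays { int * atomic ; int n ; };   // mirrors the role of CudaLockArraysStruct

__device__ __constant__ LockArrays g_locks ;    // device-side copy of the host struct

__global__ void init_locks()
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x ;
  if ( i < g_locks.n ) g_locks.atomic[i] = 0 ;  // every lock starts out unlocked
}

void setup_locks( int * device_lock_buffer , int concurrency )
{
  LockArrays h ;
  h.atomic = device_lock_buffer ;
  h.n      = concurrency ;                      // same role as locks.n above
  cudaMemcpyToSymbol( g_locks , & h , sizeof(LockArrays) );
  init_locks<<< ( concurrency + 255 ) / 256 , 256 >>>();
}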
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
index eeea97049fa3e8ba949fb9aed7841b4639bea928..44d908d1023197c5a8d0232a3d13ff49d06ef8d9 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -505,18 +505,18 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
       std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
       std::cout << "                                  without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
       std::cout << "                                  The code must call Cuda::fence() after each kernel" << std::endl;
-      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl; 
+      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl;
     }
 
     const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
     bool force_device_alloc;
     if (env_force_device_alloc == 0) force_device_alloc=false;
     else force_device_alloc=atoi(env_force_device_alloc)!=0;
-  
+
     const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
     bool visible_devices_one=true;
     if (env_visible_devices == 0) visible_devices_one=false;
-    
+
     if(!visible_devices_one && !force_device_alloc) {
       std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
       std::cout << "                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
@@ -536,6 +536,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
   locks.atomic = atomic_lock_array_cuda_space_ptr(false);
   locks.scratch = scratch_lock_array_cuda_space_ptr(false);
   locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+  locks.n = Kokkos::Cuda::concurrency();
   cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
   #endif
 }
@@ -620,9 +621,9 @@ void CudaInternal::finalize()
   was_finalized = 1;
   if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
 
-    atomic_lock_array_cuda_space_ptr(false);
-    scratch_lock_array_cuda_space_ptr(false);
-    threadid_lock_array_cuda_space_ptr(false);
+    atomic_lock_array_cuda_space_ptr(true);
+    scratch_lock_array_cuda_space_ptr(true);
+    threadid_lock_array_cuda_space_ptr(true);
 
     if ( m_stream ) {
       for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@@ -700,7 +701,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
 {
   Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -739,7 +740,7 @@ void Cuda::finalize()
 {
   Impl::CudaInternal::singleton().finalize();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
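The finalize() fix above switches the flag passed to the *_lock_array_cuda_space_ptr helpers from false to true, so the lock arrays are actually released at shutdown instead of being re-fetched. A minimal sketch of that allocate-on-false / free-on-true idiom, with a hypothetical helper name and a placeholder size that are not taken from the patch:

int * lock_array_ptr( bool deallocate )
{
  static int * ptr = nullptr ;
  if ( deallocate ) {            // finalize path: release and reset
    if ( ptr ) cudaFree( ptr ) ;
    ptr = nullptr ;
  }
  else if ( ptr == nullptr ) {   // first use: allocate once and reuse afterwards
    cudaMalloc( (void**) & ptr , sizeof(int) * 1024 /* placeholder length */ ) ;
  }
  return ptr ;
}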
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index fa29d732f473d727b5ac8beb81c8602d0e715914..56e6a3c1e34123d8fc58dbfffea0574acea31047 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -61,7 +61,7 @@
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <Kokkos_Vectorization.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <typeinfo>
 #endif
@@ -586,13 +586,35 @@ public:
   void operator()(void) const
   {
     // Iterate this block through the league
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
+        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+
     for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
 
       this-> template exec_team< WorkTag >(
         typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
                                     , m_shmem_begin
                                     , m_shmem_size
-                                    , m_scratch_ptr[1]
+                                    , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                     , m_scratch_size[1]
                                     , league_rank
                                     , m_league_size ) );
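In the block above, each team's starting slot is rounded up to a multiple of the team size (blockDim.x * blockDim.y) so that the atomicCAS loop hands out whole, disjoint per-team scratch slots. A tiny sketch of just that rounding, isolated as a hypothetical helper; for example round_up_to_team(200, 96) == 288, the next multiple of 96 at or above 200:

__device__ inline int round_up_to_team( int idx , int team_size )
{
  // next multiple of team_size at or above idx
  return ( ( idx + team_size - 1 ) / team_size ) * team_size ;
}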
@@ -946,11 +968,32 @@ public:
 
   __device__ inline
   void operator() () const {
-    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
+        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
   }
 
   __device__ inline
-  void run(const DummySHMEMReductionType&) const
+  void run(const DummySHMEMReductionType&, const int& threadid) const
   {
     const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
       word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@@ -964,7 +1007,7 @@ public:
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
                                         , m_shmem_size
-                                        , m_scratch_ptr[1]
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                         , m_scratch_size[1]
                                         , league_rank
                                         , m_league_size )
@@ -992,7 +1035,7 @@ public:
   }
 
   __device__ inline
-  void run(const DummyShflReductionType&) const
+  void run(const DummyShflReductionType&, const int& threadid) const
   {
     value_type value;
     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
@@ -1003,7 +1046,7 @@ public:
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
                                         , m_shmem_size
-                                        , m_scratch_ptr[1]
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                         , m_scratch_size[1]
                                         , league_rank
                                         , m_league_size )
@@ -1128,9 +1171,9 @@ public:
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
     }
 
-    if ( m_team_size >
-         Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
-               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
+    if ( unsigned(m_team_size) >
+         unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
     }
 
@@ -1621,14 +1664,25 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Cuda
 #endif
 }
 
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+/** \brief  Intra-thread vector parallel_reduce.
  *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
+ *  Calls lambda(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
+void parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+      const & loop_boundaries
+  , Lambda const & lambda
+  , ValueType & result )
+{
 #ifdef __CUDA_ARCH__
   result = ValueType();
 
@@ -1636,52 +1690,42 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::C
     lambda(i,result);
   }
 
-  if (loop_boundaries.increment > 1)
-    result += shfl_down(result, 1,loop_boundaries.increment);
-  if (loop_boundaries.increment > 2)
-    result += shfl_down(result, 2,loop_boundaries.increment);
-  if (loop_boundaries.increment > 4)
-    result += shfl_down(result, 4,loop_boundaries.increment);
-  if (loop_boundaries.increment > 8)
-    result += shfl_down(result, 8,loop_boundaries.increment);
-  if (loop_boundaries.increment > 16)
-    result += shfl_down(result, 16,loop_boundaries.increment);
-
-  result = shfl(result,0,loop_boundaries.increment);
+  Impl::cuda_intra_warp_vector_reduce(
+    Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & result ) );
+
 #endif
 }
 
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+/** \brief  Intra-thread vector parallel_reduce.
  *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
+ *  Calls lambda(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed
+ *  using JoinType::operator()(ValueType& val, const ValueType& update)
+ *  and output into result.
+ *
+ *  The input value of result must be the identity value for the
+ *  reduction operation; e.g., ( 0 , += ) or ( 1 , *= ).
+ */
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
+void parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+      const & loop_boundaries
+  , Lambda const & lambda
+  , JoinType const & join
+  , ValueType & result )
+{
 #ifdef __CUDA_ARCH__
-  ValueType result = init_result;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     lambda(i,result);
   }
 
-  if (loop_boundaries.increment > 1)
-    join( result, shfl_down(result, 1,loop_boundaries.increment));
-  if (loop_boundaries.increment > 2)
-    join( result, shfl_down(result, 2,loop_boundaries.increment));
-  if (loop_boundaries.increment > 4)
-    join( result, shfl_down(result, 4,loop_boundaries.increment));
-  if (loop_boundaries.increment > 8)
-    join( result, shfl_down(result, 8,loop_boundaries.increment));
-  if (loop_boundaries.increment > 16)
-    join( result, shfl_down(result, 16,loop_boundaries.increment));
-
-  init_result = shfl(result,0,loop_boundaries.increment);
+  Impl::cuda_intra_warp_vector_reduce(
+    Impl::Reducer< ValueType , JoinType >( join , & result ) );
+
 #endif
 }
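A hedged usage sketch for the two reworked ThreadVectorRange parallel_reduce overloads above; the view x, the extents, and the lambdas are illustrative only and not taken from the patch. The sum overload zero-initializes its result internally, while the JoinType overload expects the caller to pass the identity of the join operation (here 1 for multiplication):

#include <Kokkos_Core.hpp>

void vector_reduce_example()
{
  const int league_size = 128 , team_size = 8 , vector_length = 32 , n = 256 ;
  Kokkos::View< double * , Kokkos::Cuda > x( "x" , n ) ;
  typedef Kokkos::TeamPolicy< Kokkos::Cuda > policy_type ;

  Kokkos::parallel_for( policy_type( league_size , team_size , vector_length ) ,
    KOKKOS_LAMBDA( const policy_type::member_type & team )
  {
    double sum = 0 ;   // overwritten: the overload initializes result to ValueType()
    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team , n ) ,
      [&]( const int i , double & val ) { val += x(i) ; } , sum ) ;

    double prod = 1 ;  // caller supplies the identity of the join ( 1 for *= )
    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team , n ) ,
      [&]( const int i , double & val ) { val *= x(i) ; } ,
      []( double & val , const double & upd ) { val *= upd ; } , prod ) ;
  } ) ;
}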
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index ad9cca26ce2463df58820da78a3fb2e16c2a351c..79b3867ba24a87e787faac051c21abf6a99795de 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -55,15 +55,163 @@
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
 namespace Impl {
 
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl( *reinterpret_cast<int const *>(&in) , lane , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
 
+//----------------------------------------------------------------------------
 
-//Shfl based reductions
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Reduce within a warp over blockDim.x, the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Requires:
+ *    blockDim.x is a power of 2
+ *    blockDim.x <= 32 (one warp)
+ *
+ *  A "butterfly" pattern cannot be used because floating-point
+ *  addition is non-associative; therefore the final result must
+ *  be broadcast from the root lane.
+ */
+template< class Reducer >
+__device__ inline
+void cuda_intra_warp_vector_reduce( Reducer const & reducer )
+{
+  static_assert(
+    std::is_reference< typename Reducer::reference_type >::value , "" );
+
+  if ( 1 < blockDim.x ) {
+
+    typename Reducer::value_type tmp ;
+
+    for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
+
+      cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
+
+      if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
+    }
+
+    // Broadcast from root "lane" to all other "lanes"
+
+    cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
+  }
+}
+
+/** \brief  Inclusive scan over blockDim.x, the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Algorithm is concurrent bottom-up reductions in triangular pattern
+ *  where each CUDA thread is the root of a reduction tree from the
+ *  zeroth CUDA thread to itself.
+ *
+ *  Requires:
+ *    blockDim.x is a power of 2
+ *    blockDim.x <= 32 (one warp)
+ */
+template< typename ValueType >
+__device__ inline
+void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
+{
+  ValueType tmp ;
+
+  // Bottom up:
+  //   [t] += [t-1] if t >= 1
+  //   [t] += [t-2] if t >= 2
+  //   [t] += [t-4] if t >= 4
+  // ...
+
+  for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
+
+    cuda_shfl_up( tmp , local , i , blockDim.x );
+
+    if ( i <= threadIdx.x ) { local += tmp ; }
+  }
+}
+
+//----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
  *   (a) threads with same threadIdx.y have same value
@@ -98,7 +246,10 @@ inline void cuda_inter_warp_reduction( ValueType& value,
                                        const int max_active_thread = blockDim.y) {
 
   #define STEP_WIDTH 4
-  __shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
+  // Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
+  // The reason not to use ValueType directly is that, for types with constructors, doing so
+  // could lead to race conditions.
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
   ValueType* result = (ValueType*) & sh_result;
   const unsigned step = 32 / blockDim.x;
   unsigned shift = STEP_WIDTH;
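A hedged sketch of the shuffle ladder that cuda_intra_warp_vector_reduce above applies generically, specialized here to a plain float sum over the blockDim.x "vector" lanes. The function name is hypothetical, and it relies on the same pre-CUDA-9 __shfl intrinsics and the same requirements (blockDim.x a power of two, at most one warp) as the patch:

__device__ inline float warp_vector_sum( float val )
{
  for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
    val += __shfl_down( val , i , blockDim.x ) ;  // pull the partial sum from lane threadIdx.x + i
  }
  return __shfl( val , 0 , blockDim.x ) ;         // broadcast lane 0's total so all lanes agree
}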
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
index c96b8b7d40666830032ee560840cddcc9e52fe04..cf3e55d50cf416cbb6a268c85602e7c7dd8fa4e2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -91,7 +91,7 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
       // Loop by priority and then type
       for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
         for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
-          task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
+          task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 479294f3078a4e0d055610cb38b599415bbac921..a13e37837d8005867f1087b827a4d7e59ebd3209 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -61,6 +61,8 @@ void set_cuda_task_base_apply_function_pointer
 
 }
 
+template< class > class TaskExec ;
+
 template<>
 class TaskQueueSpecialization< Kokkos::Cuda >
 {
@@ -69,6 +71,7 @@ public:
   using execution_space = Kokkos::Cuda ;
   using memory_space    = Kokkos::CudaUVMSpace ;
   using queue_type      = TaskQueue< execution_space > ;
+  using member_type     = TaskExec< Kokkos::Cuda > ;
 
   static
   void iff_single_thread_recursive_execute( queue_type * const ) {}
@@ -79,13 +82,15 @@ public:
   static
   void execute( queue_type * const );
 
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
+  typename TaskType::function_type
+  get_function_pointer()
     {
-      using TaskType = TaskBase< execution_space
-                               , typename FunctorType::value_type
-                               , FunctorType > ;
+      using function_type = typename TaskType::function_type ;
+
+      function_type * const ptr =
+        (function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
 
       CUDA_SAFE_CALL( cudaDeviceSynchronize() );
 
@@ -93,6 +98,8 @@ public:
 
       CUDA_SAFE_CALL( cudaGetLastError() );
       CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      return *ptr ;
     }
 };
 
@@ -435,18 +442,26 @@ void parallel_reduce
 // blockDim.y == team_size
 // threadIdx.x == position in vec
 // threadIdx.y == member number
-template< typename ValueType, typename iType, class Lambda >
+template< typename iType, class Closure >
 KOKKOS_INLINE_FUNCTION
 void parallel_scan
   (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
-   const Lambda & lambda) {
+   const Closure & closure )
+{
+  // Extract value_type from closure
 
-  ValueType accum = 0 ;
-  ValueType val, y, local_total;
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+  value_type val, y, local_total;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     val = 0;
-    lambda(i,val,false);
+    closure(i,val,false);
 
     // intra-blockDim.y exclusive scan on 'val'
     // accum = accumulated, sum in total for this iteration
@@ -458,7 +473,7 @@ void parallel_scan
     }
 
     // pass accum to all threads
-    local_total = shfl_warp_broadcast<ValueType>(val,
+    local_total = shfl_warp_broadcast<value_type>(val,
                                             threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
                                             Impl::CudaTraits::WarpSize);
 
@@ -467,7 +482,7 @@ void parallel_scan
     if ( threadIdx.y == 0 ) { val = 0 ; }
 
     val += accum;
-    lambda(i,val,true);
+    closure(i,val,true);
     accum += local_total;
   }
 }
@@ -478,18 +493,26 @@ void parallel_scan
 // blockDim.y == team_size
 // threadIdx.x == position in vec
 // threadIdx.y == member number
-template< typename iType, class Lambda, typename ValueType >
+template< typename iType, class Closure >
 KOKKOS_INLINE_FUNCTION
 void parallel_scan
   (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
-   const Lambda & lambda)
+   const Closure & closure )
 {
-  ValueType accum = 0 ;
-  ValueType val, y, local_total;
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+  value_type val, y, local_total;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     val = 0;
-    lambda(i,val,false);
+    closure(i,val,false);
 
     // intra-blockDim.x exclusive scan on 'val'
     // accum = accumulated, sum in total for this iteration
@@ -501,14 +524,14 @@ void parallel_scan
     }
 
     // pass accum to all threads
-    local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
+    local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
 
     // make EXCLUSIVE scan by shifting values over one
     val = Kokkos::shfl_up(val, 1, blockDim.x);
     if ( threadIdx.x == 0 ) { val = 0 ; }
 
     val += accum;
-    lambda(i,val,true);
+    closure(i,val,true);
     accum += local_total;
   }
 }
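With the change above, the task-parallel parallel_scan deduces its value type from the closure via FunctorAnalysis, so no explicit ValueType template argument is needed. A hedged usage sketch from inside a task body; member, n, counts, and offsets are assumed to exist (a TaskExec<Kokkos::Cuda> team member, an extent, and two views) and are not taken from the patch:

Kokkos::parallel_scan( Kokkos::TeamThreadRange( member , n ) ,
  [&]( const int i , int & partial , const bool final ) {
    if ( final ) offsets(i) = partial ;  // final pass: write the exclusive prefix sum
    partial += counts(i) ;               // accumulate this element's contribution
  } ) ;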
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index 4e1ce855c5efc9f8ecb414096b87ea14728967f9..a450ca36ae1bb0049c2abd142e20733edcaf2f7c 100644
--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -44,36 +44,47 @@
 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 
+#include <initializer_list>
+
+#include<impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <initializer_list>
 
-#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_MDRANGE_IVDEP
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
 #endif
 
 namespace Kokkos { namespace Experimental {
 
+// ------------------------------------------------------------------ //
+
 enum class Iterate
 {
   Default, // Default for the device
   Left,    // Left indices stride fastest
   Right,   // Right indices stride fastest
-  Flat,    // Do not tile, only valid for inner direction
 };
 
 template <typename ExecSpace>
 struct default_outer_direction
 {
   using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
   static constexpr Iterate value = Iterate::Right;
+  #endif
 };
 
 template <typename ExecSpace>
 struct default_inner_direction
 {
   using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
   static constexpr Iterate value = Iterate::Right;
+  #endif
 };
 
 
@@ -86,7 +97,7 @@ struct Rank
 {
   static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
   static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
-  static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
+  static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
 
   using iteration_pattern = Rank<N, OuterDir, InnerDir>;
 
@@ -96,515 +107,370 @@ struct Rank
 };
 
 
-
 // multi-dimensional iteration pattern
 template <typename... Properties>
 struct MDRangePolicy
+  : public Kokkos::Impl::PolicyTraits<Properties ...>
 {
+  using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
   using range_policy = RangePolicy<Properties...>;
 
-  static_assert( !std::is_same<range_policy,void>::value
+  using impl_range_policy = RangePolicy< typename traits::execution_space
+                                       , typename traits::schedule_type
+                                       , typename traits::index_type
+                                       > ;
+
+  static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
                , "Kokkos Error: MD iteration pattern not defined" );
 
-  using iteration_pattern   = typename range_policy::iteration_pattern;
-  using work_tag            = typename range_policy::work_tag;
+  using iteration_pattern   = typename traits::iteration_pattern;
+  using work_tag            = typename traits::work_tag;
 
   static constexpr int rank = iteration_pattern::rank;
 
   static constexpr int outer_direction = static_cast<int> (
-      (iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
+      (iteration_pattern::outer_direction != Iterate::Default)
     ? iteration_pattern::outer_direction
-    : default_outer_direction< typename range_policy::execution_space>::value );
+    : default_outer_direction< typename traits::execution_space>::value );
 
   static constexpr int inner_direction = static_cast<int> (
       iteration_pattern::inner_direction != Iterate::Default
     ? iteration_pattern::inner_direction
-    : default_inner_direction< typename range_policy::execution_space>::value ) ;
+    : default_inner_direction< typename traits::execution_space>::value ) ;
 
 
   // Ugly ugly workaround intel 14 not handling scoped enum correctly
-  static constexpr int Flat = static_cast<int>( Iterate::Flat );
   static constexpr int Right = static_cast<int>( Iterate::Right );
-
-
-  using size_type   = typename range_policy::index_type;
-  using index_type  = typename std::make_signed<size_type>::type;
-
-
-  template <typename I>
-  MDRangePolicy( std::initializer_list<I> upper_corner )
+  static constexpr int Left  = static_cast<int>( Iterate::Left );
+
+  using index_type  = typename traits::index_type;
+  using array_index_type = long;
+  using point_type  = Kokkos::Array<array_index_type,rank>; //was index_type
+  using tile_type   = Kokkos::Array<array_index_type,rank>;
+  // If point_type or tile_type is not templated on a signed integral type (i.e. it is unsigned),
+  // then passing an initializer_list of runtime-determined, non-const values of a signed
+  // integral type will produce a compiler error due to an invalid case of implicit conversion:
+  // "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
+  // This would require the user to either pass a matching index_type as a template parameter
+  // to the MDRangePolicy or static_cast the individual values.
+
+  MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
+    : m_lower(lower)
+    , m_upper(upper)
+    , m_tile(tile)
+    , m_num_tiles(1)
   {
-    static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-
-    //static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
-
-    const auto u = upper_corner.begin();
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(0);
-      m_dim[i]    = static_cast<index_type>(u[i]);
-      if (inner_direction != Flat) {
-        // default tile size to 4
-        m_tile[i] = 4;
-      } else {
-        m_tile[i] = 1;
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = upper[i] - lower[i];
+        if ( m_tile[i] <= 0 ) {
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = span;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
       }
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
     }
-  }
-
-  template <typename IA, typename IB>
-  MDRangePolicy( std::initializer_list<IA> corner_a
-               , std::initializer_list<IB> corner_b
-               )
-  {
-    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
-    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
-    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
-
-
-    using A = typename std::make_signed<IA>::type;
-    using B = typename std::make_signed<IB>::type;
-
-    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
-    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
-      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
-      if (inner_direction != Flat) {
-        // default tile size to 4
-        m_tile[i] = 4;
-      } else {
-        m_tile[i] = 1;
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = upper[i] - lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
+      index_type total_tile_size_check = 1;
+      for (int i=0; i<rank; ++i) {
+        total_tile_size_check *= m_tile[i];
+      }
+      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
       }
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
-    }
-  }
-
-  template <typename IA, typename IB, typename T>
-  MDRangePolicy( std::initializer_list<IA> corner_a
-               , std::initializer_list<IB> corner_b
-               , std::initializer_list<T> tile
-               )
-  {
-    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
-    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
-    static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
-    static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
-    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
-    //static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
-
-    using A = typename std::make_signed<IA>::type;
-    using B = typename std::make_signed<IB>::type;
-
-    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
-    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
-    const auto t = tile.begin();
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
-      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
-      m_tile[i]   = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
     }
+    #endif
   }
 
-  index_type   m_offset[rank];
-  index_type   m_dim[rank];
-  int          m_tile[rank];
-  index_type   m_tile_dim[rank];
-  size_type    m_num_tiles;       // product of tile dims
-};
-
-namespace Impl {
 
-// Serial, Threads, OpenMP
-// use enable_if to overload for Cuda
-template < typename MDRange, typename Functor, typename Enable = void >
-struct MDForFunctor
-{
-  using work_tag   = typename MDRange::work_tag;
-  using index_type = typename MDRange::index_type;
-  using size_type  = typename MDRange::size_type;
-
-  MDRange m_range;
-  Functor m_func;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange const& range, Functor const& f )
-    : m_range(range)
-    , m_func( f )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange const& range, Functor && f )
-    : m_range(range)
-    , m_func( std::forward<Functor>(f) )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange && range, Functor const& f )
-    : m_range( std::forward<MDRange>(range) )
-    , m_func( f )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange && range, Functor && f )
-    : m_range( std::forward<MDRange>(range) )
-    , m_func( std::forward<Functor>(f) )
-  {}
-
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDForFunctor const& ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor& operator=( MDForFunctor const& ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDForFunctor && ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor& operator=( MDForFunctor && ) = default;
-
-  // Rank-2, Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
+  template < typename LT , typename UT , typename TT = array_index_type >
+  MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
   {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
-            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
-    } else {
-      m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+#if 0
+    // This should work and would duplicate less code, but it has not been extensively tested.
+    point_type lower_tmp, upper_tmp;
+    tile_type tile_tmp;
+    for ( auto i = 0; i < rank; ++i ) {
+      lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
+      upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
+      tile_tmp[i]  = static_cast<array_index_type>(tile.begin()[i]);
     }
-  }
 
-  // Rank-2, Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
-            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
-    } else {
-      m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
-    }
-  }
+    MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
 
-  // Rank-2, Not Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    index_type t0, t1;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      t0 = t / m_range.m_tile_dim[1];
-      t1 = t % m_range.m_tile_dim[1];
-    } else {
-      t0 = t % m_range.m_tile_dim[0];
-      t1 = t / m_range.m_tile_dim[0];
-    }
+#else
+    if(lower.size()!=rank || upper.size() != rank)
+      Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i1=b1; i1<e1; ++i1) {
-        m_func( i0, i1 );
-      }}
-    } else {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( i0, i1 );
-      }}
+    for ( auto i = 0; i < rank; ++i ) {
+      m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
+      m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
+      if(tile.size()==rank)
+        m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
+      else
+        m_tile[i] = 0;
     }
-  }
 
-  // Rank-2, Not Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    work_tag tag;
-
-    index_type t0, t1;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      t0 = t / m_range.m_tile_dim[1];
-      t1 = t % m_range.m_tile_dim[1];
-    } else {
-      t0 = t % m_range.m_tile_dim[0];
-      t1 = t / m_range.m_tile_dim[0];
-    }
+    m_num_tiles = 1;
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i1=b1; i1<e1; ++i1) {
-        m_func( tag, i0, i1 );
-      }}
-    } else {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( tag, i0, i1 );
-      }}
-    }
-  }
 
-  //---------------------------------------------------------------------------
-
-  // Rank-3, Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-    const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
-    m_func( m_range.m_offset[0] + (  t / tmp_prod )
-          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
-          , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
-          );
-    } else {
-    const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
-    m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
-          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
-          , m_range.m_offset[2] + (  t / tmp_prod )
-          );
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = span;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
     }
-  }
-
-  // Rank-3, Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
-      m_func( work_tag{}
-            , m_range.m_offset[0] + (  t / tmp_prod )
-            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
-            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
-            );
-    } else {
-      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
-      m_func( work_tag{}
-            , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
-            , m_range.m_offset[2] + (  t / tmp_prod )
-            );
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
+      index_type total_tile_size_check = 1;
+      for (int i=0; i<rank; ++i) {
+        total_tile_size_check *= m_tile[i];
+      }
+      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+      }
     }
+    #endif
+#endif
   }
 
-  // Rank-3, Not Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    index_type t0, t1, t2;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
-      t0 = t / tmp_prod;
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
-      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
-    } else {
-      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
-      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
-      t2 = t / tmp_prod;
-    }
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i2=b2; i2<e2; ++i2) {
-        m_func( i0, i1, i2 );
-      }}}
-    } else {
-      for (int i2=b2; i2<e2; ++i2) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( i0, i1, i2 );
-      }}}
-    }
-  }
+  point_type m_lower;
+  point_type m_upper;
+  tile_type  m_tile;
+  point_type m_tile_end;
+  index_type m_num_tiles;
+};
+// ------------------------------------------------------------------ //
 
-  // Rank-3, Not Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    work_tag tag;
-
-    index_type t0, t1, t2;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
-      t0 = t / tmp_prod;
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
-      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
-    } else {
-      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
-      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
-      t2 = t / tmp_prod;
-    }
+// ------------------------------------------------------------------ //
+//md_parallel_for
+// ------------------------------------------------------------------ //
+template <typename MDRange, typename Functor, typename Enable = void>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i2=b2; i2<e2; ++i2) {
-        m_func( tag, i0, i1, i2 );
-      }}}
-    } else {
-      for (int i2=b2; i2<e2; ++i2) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( tag, i0, i1, i2 );
-      }}}
-    }
-  }
-};
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
 
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
 
-} // namespace Impl
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
 
+// Cuda specialization
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
 
 template <typename MDRange, typename Functor>
 void md_parallel_for( MDRange const& range
                     , Functor const& f
                     , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
                     )
 {
-  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
+#endif
+// ------------------------------------------------------------------ //
 
-  using range_policy = typename MDRange::range_policy;
+// ------------------------------------------------------------------ //
+//md_parallel_reduce
+// ------------------------------------------------------------------ //
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
 
-  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
-template <typename MDRange, typename Functor>
-void md_parallel_for( const std::string& str
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( const std::string& str
                     , MDRange const& range
                     , Functor const& f
+                    , ValueType & v
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
                     )
 {
-  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
 
-  using range_policy = typename MDRange::range_policy;
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
 
-  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
+// Cuda - parallel_reduce not implemented yet
+/*
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
+  closure.execute();
+}
+
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
+  closure.execute();
+}
+*/
+
 }} // namespace Kokkos::Experimental
 
 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
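For orientation, a minimal host-side sketch of how the md_parallel_for / md_parallel_reduce entry points introduced above are meant to be called, assuming the 2.03-era Experimental MDRangePolicy interface; the extents, tile sizes, and the view name are illustrative placeholders, not part of this patch.

#include <Kokkos_Core.hpp>

int main( int argc, char* argv[] ) {
  Kokkos::initialize( argc, argv );
  {
    const int N0 = 64, N1 = 32;
    Kokkos::View<double**> a( "a", N0, N1 );

    // Rank-2 tiled iteration space: lower bounds, upper bounds, tile sizes.
    typedef Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > policy_t;
    policy_t policy( {{ 0, 0 }}, {{ N0, N1 }}, {{ 8, 8 }} );

    // Tagless rank-2 functor, dispatched tile by tile.
    Kokkos::Experimental::md_parallel_for( policy,
      KOKKOS_LAMBDA( const int i, const int j ) { a( i, j ) = i * N1 + j; } );

    // Reduction variant; the Cuda path is not implemented yet (see the
    // commented-out specializations above).
    double sum = 0.0;
    Kokkos::Experimental::md_parallel_reduce( policy,
      KOKKOS_LAMBDA( const int i, const int j, double & lsum ) { lsum += a( i, j ); },
      sum );
  }
  Kokkos::finalize();
  return 0;
}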
diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp
index 8deb5142c4352021c4305b422508b21f8524e108..abb263b7ccd7d6f82f469d06fadbc2326fe21438 100644
--- a/lib/kokkos/core/src/Kokkos_Array.hpp
+++ b/lib/kokkos/core/src/Kokkos_Array.hpp
@@ -59,8 +59,14 @@ template< class T      = void
         , class Proxy  = void
         >
 struct Array {
-private:
-  T m_elem[N];
+public:
+  /**
+   * The elements of this C array shall not be accessed directly. The data
+   * member has to be declared public to enable aggregate initialization as for
+   * std::array. We mark it as private in the documentation.
+   * @private
+   */
+  T m_internal_implementation_private_member_data[N];
 public:
 
   typedef T &                                 reference ;
@@ -78,25 +84,32 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
-      return m_elem[i];
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
     }
 
   template< typename iType >
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
-      return m_elem[i];
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
     }
 
-  KOKKOS_INLINE_FUNCTION pointer       data()       { return & m_elem[0] ; }
-  KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
+  KOKKOS_INLINE_FUNCTION pointer       data()
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
 
-  ~Array() = default ;
-  Array() = default ;
-  Array( const Array & ) = default ;
-  Array & operator = ( const Array & ) = default ;
+  // Do not default unless move and move-assignment are also defined
+  // ~Array() = default ;
+  // Array() = default ;
+  // Array( const Array & ) = default ;
+  // Array & operator = ( const Array & ) = default ;
 
   // Some supported compilers are not sufficiently C++11 compliant
   // for default move constructor and move assignment operator.
@@ -124,7 +137,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   value_type operator[]( const iType & )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
       return value_type();
     }
 
@@ -132,7 +145,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   value_type operator[]( const iType & ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
       return value_type();
     }
 
@@ -181,7 +194,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i];
     }
 
@@ -189,7 +202,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i];
     }
 
@@ -250,7 +263,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i*m_stride];
     }
 
@@ -258,7 +271,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i*m_stride];
     }
 
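A small sketch of what the two changes above enable, under the assumption that this patch is applied: the now-public storage member allows aggregate initialization in the std::array style, and operator[] accepts enum indices as well as integral ones. The Axis enum and norm2 helper are illustrative, not part of Kokkos.

#include <Kokkos_Core.hpp>

enum Axis { X = 0, Y = 1, Z = 2 };

// Hypothetical helper: indexes a Kokkos::Array with an enum, which the
// relaxed static_assert above now permits without a cast.
double norm2( const Kokkos::Array<double, 3> & v ) {
  return v[X] * v[X] + v[Y] * v[Y] + v[Z] * v[Z];
}

int main() {
  Kokkos::Array<double, 3> v = { 1.0, 2.0, 3.0 };  // aggregate initialization
  return norm2( v ) > 0.0 ? 0 : 1;
}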
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
index 3f9bdea40da551332852448b3b7fb68952bd1875..cfcdabf95e3e085cf388f14e99fb6b4db3d8c654 100644
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -102,6 +102,7 @@ KOKKOS_IMPL_IS_CONCEPT( memory_traits )
 KOKKOS_IMPL_IS_CONCEPT( execution_space )
 KOKKOS_IMPL_IS_CONCEPT( execution_policy )
 KOKKOS_IMPL_IS_CONCEPT( array_layout )
+KOKKOS_IMPL_IS_CONCEPT( reducer )
 
 namespace Impl {
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index 6d92f4bf616a057bb83cc34d38ab872e77281608..16c1bce902d47f38a1cd455df8f8900d3e73c0a5 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -57,6 +57,10 @@
 #include <Kokkos_OpenMP.hpp>
 #endif
 
+#if defined( KOKKOS_ENABLE_QTHREADS )
+#include <Kokkos_Qthreads.hpp>
+#endif
+
 #if defined( KOKKOS_ENABLE_PTHREAD )
 #include <Kokkos_Threads.hpp>
 #endif
@@ -76,6 +80,7 @@
 
 #include <Kokkos_Complex.hpp>
 
+#include <iosfwd>
 
 //----------------------------------------------------------------------------
 
@@ -105,6 +110,9 @@ void finalize_all();
 
 void fence();
 
+/** \brief Print "Bill of Materials" */
+void print_configuration( std::ostream & , const bool detail = false );
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -159,4 +167,3 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 //----------------------------------------------------------------------------
 
 #endif
-
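The print_configuration declaration added above is the hook for the "Bill of Materials" report; a typical call site, sketched here for illustration, prints it immediately after initialization.

#include <Kokkos_Core.hpp>
#include <iostream>

int main( int argc, char* argv[] ) {
  Kokkos::initialize( argc, argv );
  Kokkos::print_configuration( std::cout, true );  // detail = true for the full report
  Kokkos::finalize();
  return 0;
}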
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index e7e6a49d379045b2da38c7b53fdde589a989adec..4029bf599c6b564a8bc6bb2b6d20f9472fe19be5 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -63,7 +63,7 @@ namespace Kokkos {
 
 struct AUTO_t {
   KOKKOS_INLINE_FUNCTION
-  constexpr const AUTO_t & operator()() const { return *this ; }
+  constexpr const AUTO_t & operator()() const { return *this; }
 };
 
 namespace {
@@ -73,46 +73,49 @@ constexpr AUTO_t AUTO = Kokkos::AUTO_t();
 
 struct InvalidType {};
 
-}
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Forward declarations for class inter-relationships
 
 namespace Kokkos {
 
-class HostSpace ; ///< Memory space for main process and CPU execution spaces
+class HostSpace; ///< Memory space for main process and CPU execution spaces
 
 #ifdef KOKKOS_ENABLE_HBWSPACE
 namespace Experimental {
-class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
+class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
 }
 #endif
 
 #if defined( KOKKOS_ENABLE_SERIAL )
-class Serial ;    ///< Execution space main process on CPU
-#endif // defined( KOKKOS_ENABLE_SERIAL )
+class Serial;    ///< Execution space main process on CPU.
+#endif
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+class Qthreads;  ///< Execution space with Qthreads back-end.
+#endif
 
 #if defined( KOKKOS_ENABLE_PTHREAD )
-class Threads ;  ///< Execution space with pthreads back-end
+class Threads;   ///< Execution space with pthreads back-end.
 #endif
 
 #if defined( KOKKOS_ENABLE_OPENMP )
-class OpenMP ; ///< OpenMP execution space
+class OpenMP;    ///< OpenMP execution space.
 #endif
 
 #if defined( KOKKOS_ENABLE_CUDA )
-class CudaSpace ;            ///< Memory space on Cuda GPU
-class CudaUVMSpace ;         ///< Memory space on Cuda GPU with UVM
-class CudaHostPinnedSpace ;  ///< Memory space on Host accessible to Cuda GPU
-class Cuda ;                 ///< Execution space for Cuda GPU
+class CudaSpace;            ///< Memory space on Cuda GPU
+class CudaUVMSpace;         ///< Memory space on Cuda GPU with UVM
+class CudaHostPinnedSpace;  ///< Memory space on Host accessible to Cuda GPU
+class Cuda;                 ///< Execution space for Cuda GPU
 #endif
 
 template<class ExecutionSpace, class MemorySpace>
 struct Device;
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Set the default execution space.
 
@@ -122,60 +125,66 @@ struct Device;
 
 namespace Kokkos {
 
-#if   defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-  typedef Cuda DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef OpenMP DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Threads DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-  typedef Serial DefaultExecutionSpace ;
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Cuda DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultExecutionSpace;
 #else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
 #endif
 
-#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef OpenMP DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Threads DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-  typedef Serial DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_OPENMP )
-  typedef OpenMP DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_PTHREAD )
-  typedef Threads DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_SERIAL )
-  typedef Serial DefaultHostExecutionSpace ;
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_PTHREAD )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
 #else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
 #endif
 
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Detect the active execution space and define its memory space.
 // This is used to verify whether a running kernel can access
 // a given memory space.
 
 namespace Kokkos {
+
 namespace Impl {
 
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_ENABLE_CUDA)
-typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace;
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace;
 #else
-typedef void ActiveExecutionMemorySpace ;
+typedef void ActiveExecutionMemorySpace;
 #endif
 
-template< class ActiveSpace , class MemorySpace >
+template< class ActiveSpace, class MemorySpace >
 struct VerifyExecutionCanAccessMemorySpace {
   enum {value = 0};
 };
 
 template< class Space >
-struct VerifyExecutionCanAccessMemorySpace< Space , Space >
+struct VerifyExecutionCanAccessMemorySpace< Space, Space >
 {
   enum {value = 1};
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
@@ -183,33 +192,33 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
-    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
 
 #define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
-    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
 
 //----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 
 namespace Kokkos {
   void fence();
 }
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template< class Functor
         , class Policy
         , class EnableFunctor = void
-	      , class EnablePolicy = void
+        , class EnablePolicy = void
         >
 struct FunctorPolicyExecutionSpace;
 
@@ -220,18 +229,18 @@ struct FunctorPolicyExecutionSpace;
 ///
 /// This is an implementation detail of parallel_for.  Users should
 /// skip this and go directly to the nonmember function parallel_for.
-template< class FunctorType , class ExecPolicy , class ExecutionSpace =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelFor ;
+template< class FunctorType, class ExecPolicy, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelFor;
 
 /// \class ParallelReduce
 /// \brief Implementation detail of parallel_reduce.
 ///
 /// This is an implementation detail of parallel_reduce.  Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelReduce ;
+template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelReduce;
 
 /// \class ParallelScan
 /// \brief Implementation detail of parallel_scan.
@@ -239,10 +248,12 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
 /// This is an implementation detail of parallel_scan.  Users should
 /// skip this and go directly to the documentation of the nonmember
 /// template function Kokkos::parallel_scan.
-template< class FunctorType , class ExecPolicy , class ExecutionSapce =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelScan ;
+template< class FunctorType, class ExecPolicy, class ExecutionSapce =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelScan;
 
-}}
-#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+} // namespace Impl
+
+} // namespace Kokkos
 
+#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
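A rough sketch of how the selection chain above surfaces to user code: once the preprocessor has picked a back-end, DefaultExecutionSpace and DefaultHostExecutionSpace are ordinary typedefs that can be inspected or used directly. typeid is used here only because it needs no Kokkos-specific API; which types appear depends entirely on the enabled back-ends.

#include <Kokkos_Core.hpp>
#include <iostream>
#include <typeinfo>

int main( int argc, char* argv[] ) {
  Kokkos::initialize( argc, argv );
  // Report the compile-time default spaces chosen by the macros above.
  std::cout << "DefaultExecutionSpace:     "
            << typeid( Kokkos::DefaultExecutionSpace ).name() << std::endl;
  std::cout << "DefaultHostExecutionSpace: "
            << typeid( Kokkos::DefaultHostExecutionSpace ).name() << std::endl;
  Kokkos::finalize();
  return 0;
}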
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index afccdb6c5246b8a9778346d2db9065eb68ab7db0..433cac5e518cfbb40a413e1b5984994d54bfacbd 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -62,7 +62,6 @@
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
-#include <KokkosExp_MDRangePolicy.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -295,6 +294,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
 #include <Cuda/Kokkos_Cuda_Task.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
 //----------------------------------------------------------------------------
 
 #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index d6bf8dcdf4520224fe238ec7eb3cc90754bd3838..fc39ce0e5bc04c4a9f2c6ee91580dbc43a45d8ef 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,14 +44,16 @@
 #ifndef KOKKOS_HBWSPACE_HPP
 #define KOKKOS_HBWSPACE_HPP
 
-
 #include <Kokkos_HostSpace.hpp>
 
 /*--------------------------------------------------------------------------*/
+
 #ifdef KOKKOS_ENABLE_HBWSPACE
 
 namespace Kokkos {
+
 namespace Experimental {
+
 namespace Impl {
 
 /// \brief Initialize lock array for arbitrary size atomics.
@@ -67,7 +69,7 @@ void init_lock_array_hbw_space();
 /// This function tries to aquire the lock for the hash value derived
 /// from the provided ptr. If the lock is successfully aquired the
 /// function returns true. Otherwise it returns false.
-bool lock_address_hbw_space(void* ptr);
+bool lock_address_hbw_space( void* ptr );
 
 /// \brief Release lock for the address
 ///
@@ -75,13 +77,16 @@ bool lock_address_hbw_space(void* ptr);
 /// from the provided ptr. This function should only be called
 /// after previously successfully aquiring a lock with
 /// lock_address.
-void unlock_address_hbw_space(void* ptr);
+void unlock_address_hbw_space( void* ptr );
 
 } // namespace Impl
-} // neamspace Experimental
+
+} // namespace Experimental
+
 } // namespace Kokkos
 
 namespace Kokkos {
+
 namespace Experimental {
 
 /// \class HBWSpace
@@ -91,10 +96,9 @@ namespace Experimental {
 /// memory means the usual CPU-accessible memory.
 class HBWSpace {
 public:
-
   //! Tag this class as a kokkos memory space
-  typedef HBWSpace  memory_space ;
-  typedef size_t     size_type ;
+  typedef HBWSpace  memory_space;
+  typedef size_t     size_type;
 
   /// \typedef execution_space
   /// \brief Default execution space for this memory space.
@@ -103,21 +107,25 @@ public:
   /// useful for things like initializing a View (which happens in
   /// parallel using the View's default execution space).
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_PTHREAD )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_SERIAL )
-  typedef Kokkos::Serial   execution_space ;
+  typedef Kokkos::Serial    execution_space;
 #else
-#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qhreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
 #endif
 
   //! This memory space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
   /*--------------------------------*/
   /* Functions unique to the HBWSpace */
@@ -129,72 +137,73 @@ public:
 
   /**\brief  Default memory space instance */
   HBWSpace();
-  HBWSpace( const HBWSpace & rhs ) = default ;
-  HBWSpace & operator = ( const HBWSpace & ) = default ;
-  ~HBWSpace() = default ;
+  HBWSpace( const HBWSpace & rhs ) = default;
+  HBWSpace & operator = ( const HBWSpace & ) = default;
+  ~HBWSpace() = default;
 
   /**\brief  Non-default memory space instance to choose allocation mechansim, if available */
 
-  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
 
   explicit
   HBWSpace( const AllocationMechanism & );
 
   /**\brief  Allocate untracked memory in the space */
-  void * allocate( const size_t arg_alloc_size ) const ;
+  void * allocate( const size_t arg_alloc_size ) const;
 
   /**\brief  Deallocate untracked memory in the space */
-  void deallocate( void * const arg_alloc_ptr 
-                 , const size_t arg_alloc_size ) const ;
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name();
 
 private:
 
-  AllocationMechanism  m_alloc_mech ;
+  AllocationMechanism  m_alloc_mech;
   static constexpr const char* m_name = "HBW";
-  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
 
 } // namespace Experimental
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
-  : public SharedAllocationRecord< void , void >
+class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
+  : public SharedAllocationRecord< void, void >
 {
 private:
 
-  friend Kokkos::Experimental::HBWSpace ;
+  friend Kokkos::Experimental::HBWSpace;
 
-  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  typedef SharedAllocationRecord< void, void >  RecordBase;
 
-  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
-  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
 
   static void deallocate( RecordBase * );
 
   /**\brief  Root record for tracked allocations from this HBWSpace instance */
-  static RecordBase s_root_record ;
+  static RecordBase s_root_record;
 
-  const Kokkos::Experimental::HBWSpace m_space ;
+  const Kokkos::Experimental::HBWSpace m_space;
 
 protected:
 
   ~SharedAllocationRecord();
-  SharedAllocationRecord() = default ;
+  SharedAllocationRecord() = default;
 
-  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace        & arg_space
-                        , const std::string              & arg_label
-                        , const size_t                     arg_alloc_size
-                        , const RecordBase::function_type  arg_dealloc = & deallocate
+  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
+                        , const std::string                    & arg_label
+                        , const size_t                           arg_alloc_size
+                        , const RecordBase::function_type        arg_dealloc = & deallocate
                         );
 
 public:
@@ -206,23 +215,23 @@ public:
     }
 
   KOKKOS_INLINE_FUNCTION static
-  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace &  arg_space
-                                   , const std::string       &  arg_label
-                                   , const size_t               arg_alloc_size
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace & arg_space
+                                   , const std::string                    & arg_label
+                                   , const size_t                           arg_alloc_size
                                    )
     {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+      return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
 #else
-      return (SharedAllocationRecord *) 0 ;
+      return (SharedAllocationRecord *) 0;
 #endif
     }
 
   /**\brief  Allocate tracked memory in the space */
   static
   void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
-                         , const std::string & arg_label
-                         , const size_t arg_alloc_size );
+                         , const std::string                    & arg_label
+                         , const size_t                           arg_alloc_size );
 
   /**\brief  Reallocate tracked memory in the space */
   static
@@ -233,88 +242,93 @@ public:
   static
   void deallocate_tracked( void * const arg_alloc_ptr );
 
-
   static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
 
-  static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
+  static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
 };
 
 } // namespace Impl
-} // namespace Kokkos
 
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
-static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
 
 template<>
-struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
+struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
   enum { assignable = true };
   enum { accessible = true };
   enum { deepcopy   = true };
 };
 
 template<>
-struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
+struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
   enum { assignable = false };
   enum { accessible = true };
   enum { deepcopy   = true };
 };
 
-}}
+} // namespace Impl
+
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Impl {
 
+namespace Impl {
 
-template<class ExecutionSpace>
-struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
-template<class ExecutionSpace>
-struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
-template<class ExecutionSpace>
-struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
 {
   enum { value = true };
   inline static void verify( void ) { }
@@ -322,7 +336,7 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experime
 };
 
 template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
 {
   enum { value = true };
   inline static void verify( void ) { }
@@ -330,8 +344,9 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kok
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 #endif
-#endif /* #define KOKKOS_HBWSPACE_HPP */
 
+#endif // #define KOKKOS_HBWSPACE_HPP
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index e79de462bfe354fe5f7eb77100cdcc4e7aca2aef..82006665ce0a6a4ba37ae88ad8e7456d4c75101a 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -60,6 +60,7 @@
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
 
 /// \brief Initialize lock array for arbitrary size atomics.
@@ -83,9 +84,10 @@ bool lock_address_host_space(void* ptr);
 /// from the provided ptr. This function should only be called
 /// after previously successfully aquiring a lock with
 /// lock_address.
-void unlock_address_host_space(void* ptr);
+void unlock_address_host_space( void* ptr );
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 namespace Kokkos {
@@ -97,10 +99,9 @@ namespace Kokkos {
 /// memory means the usual CPU-accessible memory.
 class HostSpace {
 public:
-
   //! Tag this class as a kokkos memory space
-  typedef HostSpace  memory_space ;
-  typedef size_t     size_type ;
+  typedef HostSpace  memory_space;
+  typedef size_t     size_type;
 
   /// \typedef execution_space
   /// \brief Default execution space for this memory space.
@@ -109,21 +110,25 @@ public:
   /// useful for things like initializing a View (which happens in
   /// parallel using the View's default execution space).
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_PTHREAD )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_SERIAL )
-  typedef Kokkos::Serial   execution_space ;
+  typedef Kokkos::Serial    execution_space;
 #else
-#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
 #endif
 
   //! This memory space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
   /*--------------------------------*/
   /* Functions unique to the HostSpace */
@@ -135,61 +140,57 @@ public:
 
   /**\brief  Default memory space instance */
   HostSpace();
-  HostSpace( HostSpace && rhs ) = default ;
-  HostSpace( const HostSpace & rhs ) = default ;
-  HostSpace & operator = ( HostSpace && ) = default ;
-  HostSpace & operator = ( const HostSpace & ) = default ;
-  ~HostSpace() = default ;
+  HostSpace( HostSpace && rhs ) = default;
+  HostSpace( const HostSpace & rhs ) = default;
+  HostSpace & operator = ( HostSpace && ) = default;
+  HostSpace & operator = ( const HostSpace & ) = default;
+  ~HostSpace() = default;
 
   /**\brief  Non-default memory space instance to choose allocation mechansim, if available */
 
-  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
 
   explicit
   HostSpace( const AllocationMechanism & );
 
   /**\brief  Allocate untracked memory in the space */
-  void * allocate( const size_t arg_alloc_size ) const ;
+  void * allocate( const size_t arg_alloc_size ) const;
 
   /**\brief  Deallocate untracked memory in the space */
-  void deallocate( void * const arg_alloc_ptr 
-                 , const size_t arg_alloc_size ) const ;
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name();
 
 private:
-
-  AllocationMechanism  m_alloc_mech ;
+  AllocationMechanism  m_alloc_mech;
   static constexpr const char* m_name = "Host";
-  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
 };
 
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Impl {
 
-static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
+namespace Impl {
 
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
 
 template< typename S >
 struct HostMirror {
 private:
-
   // If input execution space can access HostSpace then keep it.
   // Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
   enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
-    < typename S::execution_space::memory_space , Kokkos::HostSpace >
-      ::accessible };
+                      < typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
 
   // If HostSpace can access memory space then keep it.
   // Example:  Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
   enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
-    < Kokkos::HostSpace , typename S::memory_space >::accessible };
+                      < Kokkos::HostSpace, typename S::memory_space >::accessible };
 
 public:
 
@@ -202,42 +203,41 @@ public:
                         , typename S::memory_space >
         , Kokkos::HostSpace
         >::type
-    >::type  Space ;
+    >::type  Space;
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-class SharedAllocationRecord< Kokkos::HostSpace , void >
-  : public SharedAllocationRecord< void , void >
+class SharedAllocationRecord< Kokkos::HostSpace, void >
+  : public SharedAllocationRecord< void, void >
 {
 private:
+  friend Kokkos::HostSpace;
 
-  friend Kokkos::HostSpace ;
-
-  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  typedef SharedAllocationRecord< void, void >  RecordBase;
 
-  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
-  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
 
   static void deallocate( RecordBase * );
 
   /**\brief  Root record for tracked allocations from this HostSpace instance */
-  static RecordBase s_root_record ;
+  static RecordBase s_root_record;
 
-  const Kokkos::HostSpace m_space ;
+  const Kokkos::HostSpace m_space;
 
 protected:
-
   ~SharedAllocationRecord();
-  SharedAllocationRecord() = default ;
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord( const Kokkos::HostSpace        & arg_space
                         , const std::string              & arg_label
@@ -249,22 +249,23 @@ public:
 
   inline
   std::string get_label() const
-    {
-      return std::string( RecordBase::head()->m_label );
-    }
+  {
+    return std::string( RecordBase::head()->m_label );
+  }
 
   KOKKOS_INLINE_FUNCTION static
   SharedAllocationRecord * allocate( const Kokkos::HostSpace &  arg_space
                                    , const std::string       &  arg_label
                                    , const size_t               arg_alloc_size
                                    )
-    {
+  {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+    return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
 #else
-      return (SharedAllocationRecord *) 0 ;
+    return (SharedAllocationRecord *) 0;
 #endif
-    }
+  }
+
 
   /**\brief  Allocate tracked memory in the space */
   static
@@ -281,37 +282,37 @@ public:
   static
   void deallocate_tracked( void * const arg_alloc_ptr );
 
-
   static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
 
-  static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
+  static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
-template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
+template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
 
-template<class ExecutionSpace>
-struct DeepCopy<HostSpace,HostSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
 } // namespace Impl
-} // namespace Kokkos
-
 
-#endif /* #define KOKKOS_HOSTSPACE_HPP */
+} // namespace Kokkos
 
+#endif // #define KOKKOS_HOSTSPACE_HPP
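The HostMirror trait above feeds into how Kokkos picks the space for a host-side mirror of a View; at the user level this shows up through create_mirror_view and deep_copy, sketched below with placeholder sizes only for illustration.

#include <Kokkos_Core.hpp>

int main( int argc, char* argv[] ) {
  Kokkos::initialize( argc, argv );
  {
    Kokkos::View<double*> d( "d", 100 );       // lives in the default memory space
    auto h = Kokkos::create_mirror_view( d );  // host-accessible mirror of d
    for ( int i = 0; i < 100; ++i ) h( i ) = double( i );
    Kokkos::deep_copy( d, h );                 // copy back to the default memory space
  }
  Kokkos::finalize();
  return 0;
}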
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 52845b9e093bcc6cd363b144ac59df0bda8bb124..c138b08c94a5a9f93e7faeb067283a221486cb4a 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -45,22 +45,20 @@
 #define KOKKOS_MACROS_HPP
 
 //----------------------------------------------------------------------------
-/** Pick up configure/build options via #define macros:
+/** Pick up configure / build options via #define macros:
  *
  *  KOKKOS_ENABLE_CUDA                Kokkos::Cuda execution and memory spaces
  *  KOKKOS_ENABLE_PTHREAD             Kokkos::Threads execution space
- *  KOKKOS_ENABLE_QTHREAD             Kokkos::Qthread execution space
- *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP  execution space
- *  KOKKOS_ENABLE_HWLOC               HWLOC library is available
- *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK    insert array bounds checks, is expensive!
- *
- *  KOKKOS_ENABLE_MPI                 negotiate MPI/execution space interactions
- *
- *  KOKKOS_ENABLE_CUDA_UVM             Use CUDA UVM for Cuda memory space
+ *  KOKKOS_ENABLE_QTHREADS            Kokkos::Qthreads execution space
+ *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
+ *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space interactions.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
  */
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
-#include <KokkosCore_config.h>
+  #include <KokkosCore_config.h>
 #endif
 
 #include <impl/Kokkos_OldMacros.hpp>
@@ -86,7 +84,7 @@
  *  KOKKOS_ENABLE_INTEL_ATOMICS
  *  KOKKOS_ENABLE_OPENMP_ATOMICS
  *
- *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
+ *  A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
  *
  *  Macros for marking functions to run in an execution space:
  *
@@ -98,64 +96,63 @@
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
+  // Compiling with a CUDA compiler.
+  //
+  //  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+  //    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+  //
+  //  When generating device code the __CUDA_ARCH__ macro is defined as:
+  //    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
+
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+
+  #if !defined( CUDA_VERSION )
+    #error "#include <cuda.h> did not define CUDA_VERSION."
+  #endif
 
-/*  Compiling with a CUDA compiler.
- *
- *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
- *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
- *
- *  When generating device code the __CUDA_ARCH__ macro is defined as:
- *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
- */
+  #if ( CUDA_VERSION < 7000 )
+    // CUDA supports C++11 in device code starting with version 7.0.
+    // This includes auto type and device code internal lambdas.
+    #error "Cuda version 7.0 or greater required."
+  #endif
 
-#include <cuda_runtime.h>
-#include <cuda.h>
+  #if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
+    // Compiling with CUDA compiler for device code.
+    #error "Cuda device capability >= 3.0 is required."
+  #endif
 
-#if ! defined( CUDA_VERSION )
-#error "#include <cuda.h> did not define CUDA_VERSION"
-#endif
+  #ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+    #if ( CUDA_VERSION < 7050 )
+      // CUDA supports C++11 lambdas generated in host code to be given
+      // to the device starting with version 7.5. But the release candidate (7.5.6)
+      // still identifies as 7.0.
+      #error "Cuda version 7.5 or greater required for host-to-device Lambda support."
+    #endif
 
-#if ( CUDA_VERSION < 7000 )
-// CUDA supports C++11 in device code starting with
-// version 7.0. This includes auto type and device code internal
-// lambdas.
-#error "Cuda version 7.0 or greater required"
-#endif
+    #if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
+      #define KOKKOS_LAMBDA [=]__device__
+    #else
+      #define KOKKOS_LAMBDA [=]__host__ __device__
 
-#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
-/*  Compiling with CUDA compiler for device code. */
-#error "Cuda device capability >= 3.0 is required"
-#endif
+      #if defined( KOKKOS_ENABLE_CXX1Z )
+        #define KOKKOS_CLASS_LAMBDA        [=,*this] __host__ __device__
+      #endif
+    #endif
 
-#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
-#if ( CUDA_VERSION < 7050 )
-  // CUDA supports C++11 lambdas generated in host code to be given
-  // to the device starting with version 7.5. But the release candidate (7.5.6)
-  // still identifies as 7.0
-  #error "Cuda version 7.5 or greater required for host-to-device Lambda support"
-#endif
-#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__)
-  #define KOKKOS_LAMBDA [=]__device__
-#else
-  #define KOKKOS_LAMBDA [=]__host__ __device__
-  #if defined( KOKKOS_ENABLE_CXX1Z )
-    #define KOKKOS_CLASS_LAMBDA        [=,*this] __host__ __device__
+    #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
   #endif
-#endif
-#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
-#endif
-#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) */
+#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
 
-
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
    // Cuda version 8.0 still needs the functor wrapper
-   #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__)
+   #if /* ( CUDA_VERSION < 8000 ) && */  defined( __NVCC__ )
       #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
    #endif
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* Language info: C++, CUDA, OPENMP */
+//----------------------------------------------------------------------------
+// Language info: C++, CUDA, OPENMP
 
 #if defined( KOKKOS_ENABLE_CUDA )
   // Compiling Cuda code to 'ptx'
@@ -163,20 +160,17 @@
   #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
   #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
   #define KOKKOS_FUNCTION              __device__  __host__
-#endif /* #if defined( __CUDA_ARCH__ ) */
+#endif // #if defined( __CUDA_ARCH__ )
 
 #if defined( _OPENMP )
+  //  Compiling with OpenMP.
+  //  The value of _OPENMP is an integer value YYYYMM
+  //  where YYYY and MM are the year and month designation
+  //  of the supported OpenMP API version.
+#endif // #if defined( _OPENMP )
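// Worked example of the YYYYMM encoding described above (a sketch; the
// version-to-date pairs come from the OpenMP specifications and are given for
// illustration only):
//   _OPENMP == 201107  ->  OpenMP 3.1  (July 2011)
//   _OPENMP == 201307  ->  OpenMP 4.0  (July 2013)
#if defined( _OPENMP )
  enum { Kokkos_example_openmp_year  = _OPENMP / 100
       , Kokkos_example_openmp_month = _OPENMP % 100 };
#endif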
 
-  /*  Compiling with OpenMP.
-   *  The value of _OPENMP is an integer value YYYYMM
-   *  where YYYY and MM are the year and month designation
-   *  of the supported OpenMP API version.
-   */
-
-#endif /* #if defined( _OPENMP ) */
-
-/*--------------------------------------------------------------------------*/
-/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
+//----------------------------------------------------------------------------
+// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
 
 #if defined( __NVCC__ )
   // NVIDIA compiler is being used.
@@ -184,29 +178,28 @@
   // Host code is compiled again with another compiler.
   // Device code is compile to 'ptx'.
   #define KOKKOS_COMPILER_NVCC __NVCC__
-
 #else
-#if ! defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
-    #if !defined (KOKKOS_ENABLE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either
-    // CUDA (including version 6.5) does not support giving lambdas as
-    // arguments to global functions. Thus its not currently possible
-    // to dispatch lambdas from the host.
-    #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
+  #if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+    #if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
+      // CUDA (including version 6.5) does not support giving lambdas as
+      // arguments to global functions. Thus it's not currently possible
+      // to dispatch lambdas from the host.
+      #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
     #endif
   #endif
-#endif /* #if defined( __NVCC__ ) */
+#endif // #if defined( __NVCC__ )
 
-#if !defined (KOKKOS_LAMBDA)
+#if !defined( KOKKOS_LAMBDA )
   #define KOKKOS_LAMBDA [=]
 #endif
 
-#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
   #define KOKKOS_CLASS_LAMBDA [=,*this]
 #endif
 
-//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
+//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
 
-/* Intel compiler for host code */
+// Intel compiler for host code.
 
 #if defined( __INTEL_COMPILER )
   #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
@@ -218,7 +211,7 @@
   #define KOKKOS_COMPILER_INTEL __ECC
 #endif
 
-/* CRAY compiler for host code */
+// CRAY compiler for host code
 #if defined( _CRAYC )
   #define KOKKOS_COMPILER_CRAYC _CRAYC
 #endif
@@ -234,50 +227,53 @@
   #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
 #endif
 
-#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
+#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
   #define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
 #endif
 
-#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) && defined( __GNUC__ )
   #define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
+
   #if ( 472 > KOKKOS_COMPILER_GNU )
     #error "Compiling with GCC version earlier than 4.7.2 is not supported."
   #endif
 #endif
 
-#if defined( __PGIC__ ) && ! defined( __GNUC__ )
+#if defined( __PGIC__ ) && !defined( __GNUC__ )
   #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
+
   #if ( 1540 > KOKKOS_COMPILER_PGI )
     #error "Compiling with PGI version earlier than 15.4 is not supported."
   #endif
 #endif
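// Worked example of the major*100 + minor*10 + patch encoding used for the
// compiler-version macros above (illustrative arithmetic only):
//   GCC 4.7.2  ->  KOKKOS_COMPILER_GNU == 472, the minimum accepted above
//   GCC 5.4.0  ->  KOKKOS_COMPILER_GNU == 540
//   PGI 15.4   ->  KOKKOS_COMPILER_PGI == 1540, the minimum accepted above
// Only one decimal digit is reserved for the minor and patch fields, so the
// encoding gives a coarse ordering rather than an exact version string.
#if defined( KOKKOS_COMPILER_GNU )
  static_assert( KOKKOS_COMPILER_GNU >= 472, "restates the version check above" );
#endif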
 
-//#endif /* #if ! defined( __CUDA_ARCH__ ) */
+//#endif // #if !defined( __CUDA_ARCH__ )
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Intel compiler macros */
+//----------------------------------------------------------------------------
+// Intel compiler macros
 
 #if defined( KOKKOS_COMPILER_INTEL )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
-  #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
+  #if ( __INTEL_COMPILER > 1400 )
+    #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  #endif
+
   #define KOKKOS_RESTRICT __restrict__
 
   #ifndef KOKKOS_ALIGN
-  #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
+    #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
   #endif
 
   #ifndef KOKKOS_ALIGN_PTR
-  #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
+    #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
   #endif
 
   #ifndef KOKKOS_ALIGN_SIZE
-  #define KOKKOS_ALIGN_SIZE 64
+    #define KOKKOS_ALIGN_SIZE 64
   #endif
 
   #if ( 1400 > KOKKOS_COMPILER_INTEL )
@@ -287,12 +283,13 @@
       #warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
     #endif
   #endif
-  #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
+
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
     #define KOKKOS_ENABLE_ASM 1
   #endif
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-    #if !defined (_WIN32)
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #if !defined( _WIN32 )
       #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
     #else
       #define KOKKOS_FORCEINLINE_FUNCTION inline
@@ -302,192 +299,170 @@
   #if defined( __MIC__ )
     // Compiling for Xeon Phi
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* Cray compiler macros */
+//----------------------------------------------------------------------------
+// Cray compiler macros
 
 #if defined( KOKKOS_COMPILER_CRAYC )
-
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* IBM Compiler macros */
+//----------------------------------------------------------------------------
+// IBM Compiler macros
 
 #if defined( KOKKOS_COMPILER_IBM )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* CLANG compiler macros */
+//----------------------------------------------------------------------------
+// CLANG compiler macros
 
 #if defined( KOKKOS_COMPILER_CLANG )
-
   //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* GNU Compiler macros */
+//----------------------------------------------------------------------------
+// GNU Compiler macros
 
 #if defined( KOKKOS_COMPILER_GNU )
-
   //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
   #endif
 
-  #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \
-      ( defined( __amd64 ) || \
-        defined( __amd64__ ) || \
-        defined( __x86_64 ) || \
-        defined( __x86_64__ ) )
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
+      ( defined( __amd64 ) || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ ) )
     #define KOKKOS_ENABLE_ASM 1
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
+//----------------------------------------------------------------------------
 
 #if defined( KOKKOS_COMPILER_PGI )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
-
 #endif
 
-/*--------------------------------------------------------------------------*/
+//----------------------------------------------------------------------------
 
 #if defined( KOKKOS_COMPILER_NVCC )
-
-  #if defined(__CUDA_ARCH__ )
+  #if defined( __CUDA_ARCH__ )
     #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   #endif
-
 #endif
 
 //----------------------------------------------------------------------------
-/** Define function marking macros if compiler specific macros are undefined: */
+// Define function-marking macros if compiler-specific macros are undefined:
 
-#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-#define KOKKOS_FORCEINLINE_FUNCTION  inline
+#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #define KOKKOS_FORCEINLINE_FUNCTION  inline
 #endif
 
-#if ! defined( KOKKOS_INLINE_FUNCTION )
-#define KOKKOS_INLINE_FUNCTION  inline
+#if !defined( KOKKOS_INLINE_FUNCTION )
+  #define KOKKOS_INLINE_FUNCTION  inline
 #endif
 
-#if ! defined( KOKKOS_FUNCTION )
-#define KOKKOS_FUNCTION /**/
+#if !defined( KOKKOS_FUNCTION )
+  #define KOKKOS_FUNCTION /**/
 #endif
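// Hedged usage sketch of the function-marking macros defaulted above: the same
// definition builds host-only or host+device depending on which branch defined
// the macros.  The function names and bodies are illustrative only.
KOKKOS_INLINE_FUNCTION
double kokkos_example_square( double x ) { return x * x; }

KOKKOS_FORCEINLINE_FUNCTION
double kokkos_example_axpy( double a, double x, double y ) { return a * x + y; }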
 
-
 //----------------------------------------------------------------------------
-///** Define empty macro for restrict if necessary: */
+// Define empty macro for restrict if necessary:
 
-#if ! defined(KOKKOS_RESTRICT)
-#define KOKKOS_RESTRICT
+#if !defined( KOKKOS_RESTRICT )
+  #define KOKKOS_RESTRICT
 #endif
 
 //----------------------------------------------------------------------------
-/** Define Macro for alignment: */
-#if ! defined KOKKOS_ALIGN_SIZE
-#define KOKKOS_ALIGN_SIZE 16
-#endif
+// Define macros for alignment:
 
-#if ! defined(KOKKOS_ALIGN)
-#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
+#if !defined( KOKKOS_ALIGN_SIZE )
+  #define KOKKOS_ALIGN_SIZE 16
 #endif
 
-#if ! defined(KOKKOS_ALIGN_PTR)
-#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
+#if !defined( KOKKOS_ALIGN )
+  #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
 #endif
 
-//----------------------------------------------------------------------------
-/** Determine the default execution space for parallel dispatch.
- *  There is zero or one default execution space specified.
- */
-
-#if 1 < ( ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
-
-#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
-
+#if !defined( KOKKOS_ALIGN_PTR )
+  #define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
 #endif
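// Hedged usage sketch of the alignment macros above; the struct name and the
// array extent are illustrative.  KOKKOS_ALIGN expands to the GNU-style
// aligned attribute (64-byte alignment under the Intel branch, 16 otherwise).
struct KOKKOS_ALIGN( KOKKOS_ALIGN_SIZE ) KokkosExampleAlignedBlock {
  double value[ KOKKOS_ALIGN_SIZE / sizeof(double) ];
};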
 
-/** If default is not specified then chose from enabled execution spaces.
- *  Priority: CUDA, OPENMP, THREADS, SERIAL
- */
-#if   defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-#elif defined ( KOKKOS_ENABLE_CUDA )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
-#elif defined ( KOKKOS_ENABLE_OPENMP )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
-#elif defined ( KOKKOS_ENABLE_PTHREAD )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+//----------------------------------------------------------------------------
+// Determine the default execution space for parallel dispatch.
+// There is zero or one default execution space specified.
+
+#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
+  #error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
+#endif
+
+// If default is not specified then choose from enabled execution spaces.
+// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+#elif defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+#elif defined( KOKKOS_ENABLE_PTHREAD )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
 #else
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
 #endif
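// A minimal sketch (not part of this patch): the KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_*
// macro that survives the selection above is what the Kokkos::DefaultExecutionSpace
// alias resolves to.  This assumes <Kokkos_Core.hpp> has been included; typeid is
// used only to obtain an implementation-defined name for that type.
#include <typeinfo>

inline const char * kokkos_example_default_space_name()
{
  return typeid( Kokkos::DefaultExecutionSpace ).name();
}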
 
 //----------------------------------------------------------------------------
-/** Determine for what space the code is being compiled: */
+// Determine for what space the code is being compiled:
 
-#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_ENABLE_CUDA)
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
 #else
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
 #endif
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 #if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
     ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
-#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
-#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
-#endif
+  #if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
+    #define KOKKOS_ENABLE_POSIX_MEMALIGN 1
+  #endif
 #endif
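// Hedged usage sketch of the guard defined above: allocation helpers can fall
// back to plain malloc when posix_memalign was not detected.  The helper name
// and the use of KOKKOS_ALIGN_SIZE as the alignment are illustrative choices.
#include <stdlib.h>

inline void * kokkos_example_aligned_alloc( size_t bytes )
{
  void * ptr = 0;
#if defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
  if ( posix_memalign( & ptr, KOKKOS_ALIGN_SIZE, bytes ) ) ptr = 0;
#else
  ptr = malloc( bytes );
#endif
  return ptr;
}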
 
 //----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/**Enable Profiling by default**/
+// Enable Profiling by default
 
 #ifndef KOKKOS_ENABLE_PROFILING
-#define KOKKOS_ENABLE_PROFILING 1
+  #define KOKKOS_ENABLE_PROFILING 1
 #endif
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_MACROS_HPP */
-
+#endif // #ifndef KOKKOS_MACROS_HPP
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index 2d45926e762acd61ba7f308a80c2d7f922267ffe..eadad10b4991db1e98410f8eafcd77ad9bc87db0 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -1294,6 +1294,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
 
+  KOKKOS_INLINE_FUNCTION
   size_t get_mem_size() const { return m_data_size; }
 
 private:
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index a337d1a9d4a02fcdaae38a6f402301d1a6a9ec03..c0c43b92f4d72f4fb6ae5ba95dc5270887f1cd32 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -66,7 +66,6 @@
 #include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
-#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -196,6 +195,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMP_Task.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 
 #endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp
index 83436826f4aded7131802662327d6b80c5b5c785..067767f2f83f1739fb3a40bd800300c2078c3b28 100644
--- a/lib/kokkos/core/src/Kokkos_Pair.hpp
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@@ -78,16 +78,14 @@ struct pair
   /// This calls the default constructors of T1 and T2.  It won't
   /// compile if those default constructors are not defined and
   /// public.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair()
-    : first(), second()
-  {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair() = default ;
 
   /// \brief Constructor that takes both elements of the pair.
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type const& f, second_type const& s)
     : first(f), second(s)
   {}
@@ -97,7 +95,7 @@ struct pair
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -107,7 +105,7 @@ struct pair
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const volatile pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -183,7 +181,7 @@ struct pair<T1&, T2&>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type f, second_type s)
     : first(f), second(s)
   {}
@@ -193,7 +191,7 @@ struct pair<T1&, T2&>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -247,7 +245,7 @@ struct pair<T1, T2&>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type const& f, second_type s)
     : first(f), second(s)
   {}
@@ -257,7 +255,7 @@ struct pair<T1, T2&>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -311,7 +309,7 @@ struct pair<T1&, T2>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type f, second_type const& s)
     : first(f), second(s)
   {}
@@ -321,7 +319,7 @@ struct pair<T1&, T2>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -366,31 +364,31 @@ bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 
 //! Inequality operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(lhs==rhs); }
 
 //! Less-than operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
 
 //! Less-than-or-equal-to operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(rhs<lhs); }
 
 //! Greater-than operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return rhs<lhs; }
 
 //! Greater-than-or-equal-to operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(lhs<rhs); }
 
@@ -399,7 +397,7 @@ bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 /// This is a "nonmember constructor" for Kokkos::pair.  It works just
 /// like std::make_pair.
 template <class T1,class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 pair<T1,T2> make_pair (T1 x, T2 y)
 { return ( pair<T1,T2>(x,y) ); }
 
@@ -460,23 +458,21 @@ struct pair<T1,void>
   first_type  first;
   enum { second = 0 };
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair()
-    : first()
-  {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair() = default ;
 
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(const first_type & f)
     : first(f)
   {}
 
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(const first_type & f, int)
     : first(f)
   {}
 
   template <class U>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,void> &p)
     : first(p.first)
   {}
@@ -495,32 +491,32 @@ struct pair<T1,void>
 //
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return lhs.first==rhs.first; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(lhs==rhs); }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return lhs.first<rhs.first; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(rhs<lhs); }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return rhs<lhs; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(lhs<rhs); }
 
@@ -528,3 +524,4 @@ bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 
 
 #endif //KOKKOS_PAIR_HPP
+
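// Hedged usage sketch of the constexpr qualifiers added throughout this file:
// a Kokkos::pair can now be built and inspected at compile time.  The variable
// name and values are illustrative.
#include <Kokkos_Pair.hpp>

constexpr Kokkos::pair<int, int> kokkos_example_pair = Kokkos::make_pair( 1, 2 );
static_assert( kokkos_example_pair.first == 1 && kokkos_example_pair.second == 2,
               "compile-time construction via the constexpr constructors above" );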
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index 64b1502bcc1932338a16bfcb1604eb1887d85cce..e412e608b28ca52f7d7888ea5fc37af721c5b10c 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -52,13 +52,14 @@
 #include <Kokkos_View.hpp>
 #include <Kokkos_ExecPolicy.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <typeinfo>
 #endif
 
 #include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
 #ifdef KOKKOS_DEBUG
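// Why the guard form matters (illustrative preprocessor behaviour, not part of
// this patch): if the build system defines the macro with an empty value, the
// old spelling is a preprocessing error while the new one still works.
//
//   #define KOKKOS_ENABLE_PROFILING              // empty definition
//   #if (KOKKOS_ENABLE_PROFILING)                // expands to "#if ()" -> error
//   #if defined(KOKKOS_ENABLE_PROFILING)         // true whenever the macro exists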
@@ -175,7 +176,7 @@ void parallel_for( const ExecPolicy  & policy
                  , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
                  )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
      	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@@ -185,10 +186,10 @@ void parallel_for( const ExecPolicy  & policy
     Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
     Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
     Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-   
+
    closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
      }
@@ -207,20 +208,20 @@ void parallel_for( const size_t        work_count
       execution_space ;
   typedef RangePolicy< execution_space > policy ;
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
   	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
-    
+
   Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
   Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
   Kokkos::Impl::shared_allocation_tracking_release_and_enable();
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelFor(kpID);
      }
@@ -417,7 +418,7 @@ void parallel_scan( const ExecutionPolicy & policy
                   , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
                   )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@@ -430,7 +431,7 @@ void parallel_scan( const ExecutionPolicy & policy
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelScan(kpID);
      }
@@ -450,20 +451,20 @@ void parallel_scan( const size_t        work_count
 
   typedef Kokkos::RangePolicy< execution_space > policy ;
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
-    
+
   Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
   Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
   Kokkos::Impl::shared_allocation_tracking_release_and_enable();
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelScan(kpID);
      }
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index a3649b4422dc7f581b38f2866f2bacb63b93b631..900dce19fe52b538228fbb2a82cb649f5313ec43 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -1094,7 +1094,7 @@ namespace Impl {
         const PolicyType& policy,
         const FunctorType& functor,
         ReturnType& return_value) {
-          #if (KOKKOS_ENABLE_PROFILING)
+          #if defined(KOKKOS_ENABLE_PROFILING)
             uint64_t kpID = 0;
             if(Kokkos::Profiling::profileLibraryLoaded()) {
               Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
@@ -1116,7 +1116,7 @@ namespace Impl {
           Kokkos::Impl::shared_allocation_tracking_release_and_enable();
           closure.execute();
 
-          #if (KOKKOS_ENABLE_PROFILING)
+          #if defined(KOKKOS_ENABLE_PROFILING)
             if(Kokkos::Profiling::profileLibraryLoaded()) {
               Kokkos::Profiling::endParallelReduce(kpID);
             }
diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthreads.hpp
similarity index 72%
rename from lib/kokkos/core/src/Kokkos_Qthread.hpp
rename to lib/kokkos/core/src/Kokkos_Qthreads.hpp
index c58518b0654bb3267a12041a2ab7fef4e2375972..0507552c3f95e7fb63527603c7123a19daee2b14 100644
--- a/lib/kokkos/core/src/Kokkos_Qthread.hpp
+++ b/lib/kokkos/core/src/Kokkos_Qthreads.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,57 +36,75 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-#ifndef KOKKOS_QTHREAD_HPP
-#define KOKKOS_QTHREAD_HPP
+#ifndef KOKKOS_QTHREADS_HPP
+#define KOKKOS_QTHREADS_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#ifdef KOKKOS_ENABLE_QTHREADS
+
+// Defines to enable experimental Qthreads functionality.
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
 
 #include <cstddef>
 #include <iosfwd>
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_MemoryTraits.hpp>
+
 #include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+//#include <Kokkos_MemoryTraits.hpp>
+//#include <Kokkos_ExecPolicy.hpp>
+//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
+#include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
-class QthreadExec ;
+
+class QthreadsExec;
+
 } // namespace Impl
+
 } // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 
-/** \brief  Execution space supported by Qthread */
-class Qthread {
+/** \brief  Execution space supported by Qthreads */
+class Qthreads {
 public:
   //! \name Type declarations that all Kokkos devices must provide.
   //@{
 
   //! Tag this class as an execution space
-  typedef Qthread                  execution_space ;
-  typedef Kokkos::HostSpace        memory_space ;
+  typedef Qthreads                 execution_space;
+  typedef Kokkos::HostSpace        memory_space;
   //! This execution space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  typedef Kokkos::LayoutRight      array_layout ;
-  typedef memory_space::size_type  size_type ;
+  typedef Kokkos::LayoutRight      array_layout;
+  typedef memory_space::size_type  size_type;
 
-  typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
+  typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
 
   //@}
   /*------------------------------------------------------------------------*/
 
   /** \brief  Initialization will construct one or more instances */
-  static Qthread & instance( int = 0 );
+  static Qthreads & instance( int = 0 );
 
   /** \brief  Set the execution space to a "sleep" state.
    *
@@ -100,14 +118,14 @@ public:
   bool sleep();
 
   /** \brief  Wake from the sleep state.
-   * 
+   *
    *  \return True if enters or is in the "ready" state.
    *          False if functions are currently executing.
    */
   static bool wake();
 
   /** \brief Wait until all dispatched functions to complete.
-   * 
+   *
    *  The parallel_for or parallel_reduce dispatch of a functor may
    *  return asynchronously, before the functor completes.  This
    *  method does not return until all dispatched functors on this
@@ -128,26 +146,24 @@ public:
   static void finalize();
 
   /** \brief Print configuration information to the given output stream. */
-  static void print_configuration( std::ostream & , const bool detail = false );
+  static void print_configuration( std::ostream &, const bool detail = false );
 
-  int shepherd_size() const ;
-  int shepherd_worker_size() const ;
+  int shepherd_size() const;
+  int shepherd_worker_size() const;
 };
 
-/*--------------------------------------------------------------------------*/
-
 } // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-struct MemorySpaceAccess 
-  < Kokkos::Qthread::memory_space
-  , Kokkos::Qthread::scratch_memory_space
+struct MemorySpaceAccess
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
   >
 {
   enum { assignable = false };
@@ -157,27 +173,26 @@ struct MemorySpaceAccess
 
 template<>
 struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::Qthread::memory_space
-  , Kokkos::Qthread::scratch_memory_space
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
   >
 {
   enum { value = true };
-  inline static void verify( void ) { }
-  inline static void verify( const void * ) { }
+  inline static void verify( void ) {}
+  inline static void verify( const void * ) {}
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <Kokkos_Parallel.hpp>
-#include <Qthread/Kokkos_QthreadExec.hpp>
-#include <Qthread/Kokkos_Qthread_Parallel.hpp>
 
-#endif /* #define KOKKOS_QTHREAD_HPP */
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
+#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
+//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
+//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+#endif // #ifdef KOKKOS_ENABLE_QTHREADS
 
+#endif // #ifndef KOKKOS_QTHREADS_HPP
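// Hedged usage sketch against the interface declared above (the stream, the
// detail flag, and the helper name are illustrative; assumes a build configured
// with KOKKOS_ENABLE_QTHREADS):
#include <iostream>

inline void kokkos_example_qthreads_info()
{
  Kokkos::Qthreads::print_configuration( std::cout, true );
  std::cout << "shepherds: "
            << Kokkos::Qthreads::instance().shepherd_size() << std::endl;
}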
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index f26253591007774c6d1aeb70bce6210896fea56f..72710e81679863bfc3c5e680663cf0feda2b5868 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -56,6 +56,8 @@
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
@@ -138,30 +140,15 @@ public:
   static void initialize( unsigned threads_count = 1 ,
                           unsigned use_numa_count = 0 ,
                           unsigned use_cores_per_numa = 0 ,
-                          bool allow_asynchronous_threadpool = false) {
-    (void) threads_count;
-    (void) use_numa_count;
-    (void) use_cores_per_numa;
-    (void) allow_asynchronous_threadpool;
-
-    // Init the array of locks used for arbitrarily sized atomics
-    Impl::init_lock_array_host_space();
-    #if (KOKKOS_ENABLE_PROFILING)
-      Kokkos::Profiling::initialize();
-    #endif
-  }
+                          bool allow_asynchronous_threadpool = false);
 
-  static int is_initialized() { return 1 ; }
+  static int is_initialized();
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency() {return 1;};
 
   //! Free any resources being consumed by the device.
-  static void finalize() {
-    #if (KOKKOS_ENABLE_PROFILING)
-      Kokkos::Profiling::finalize();
-    #endif
-  }
+  static void finalize();
 
   //! Print configuration information to the given output stream.
   static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@@ -177,10 +164,6 @@ public:
   inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
 
   //--------------------------------------------------------------------------
-
-  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
-
-  //--------------------------------------------------------------------------
 };
 
 } // namespace Kokkos
@@ -192,7 +175,7 @@ namespace Kokkos {
 namespace Impl {
 
 template<>
-struct MemorySpaceAccess 
+struct MemorySpaceAccess
   < Kokkos::Serial::memory_space
   , Kokkos::Serial::scratch_memory_space
   >
@@ -213,22 +196,6 @@ struct VerifyExecutionCanAccessMemorySpace
   inline static void verify( const void * ) { }
 };
 
-namespace SerialImpl {
-
-struct Sentinel {
-
-  void *   m_scratch ;
-  unsigned m_reduce_end ;
-  unsigned m_shared_end ;
-
-  Sentinel();
-  ~Sentinel();
-  static Sentinel & singleton();
-};
-
-inline
-unsigned align( unsigned n );
-}
 } // namespace Impl
 } // namespace Kokkos
 
@@ -238,89 +205,26 @@ unsigned align( unsigned n );
 namespace Kokkos {
 namespace Impl {
 
-class SerialTeamMember {
-private:
-  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
-  const scratch_memory_space  m_space ;
-  const int                   m_league_rank ;
-  const int                   m_league_size ;
-
-  SerialTeamMember & operator = ( const SerialTeamMember & );
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_scratch(int) const
-    { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & thread_scratch(int) const
-    { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes );
 
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+HostThreadTeamData * serial_get_thread_team_data();
 
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(const ValueType& , const int& ) const {}
-
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION
-  ValueType team_reduce( const ValueType & value , const JoinOp & ) const
-    {
-      return value ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-    {
-      const Type tmp = global_accum ? *global_accum : Type(0) ;
-      if ( global_accum ) { *global_accum += value ; }
-      return tmp ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
-    { return Type(0); }
-
-  //----------------------------------------
-  // Execution space specific:
+} /* namespace Impl */
+} /* namespace Kokkos */
 
-  SerialTeamMember( int arg_league_rank
-                  , int arg_league_size
-                  , int arg_shared_size
-                  );
-};
 
-} // namespace Impl
+namespace Kokkos {
+namespace Impl {
 
 /*
  * < Kokkos::Serial , WorkArgTag >
  * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
  *
  */
-namespace Impl {
 template< class ... Properties >
 class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
 {
@@ -441,14 +345,11 @@ public:
     return p;
   };
 
-  typedef Impl::SerialTeamMember  member_type ;
+  typedef Impl::HostThreadTeamMember< Kokkos::Serial >  member_type ;
 };
 } /* namespace Impl */
 } /* namespace Kokkos */
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 /* Parallel patterns for Kokkos::Serial with RangePolicy */
@@ -521,11 +422,12 @@ private:
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -535,34 +437,25 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( i , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
       const TagType t{} ;
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( t , i , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -570,10 +463,29 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
 
-      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class HostViewType >
@@ -587,7 +499,7 @@ public:
     : m_functor( arg_functor )
     , m_policy( arg_policy )
     , m_reducer( InvalidType() )
-    , m_result_ptr( arg_result_view.ptr_on_device() )
+    , m_result_ptr( arg_result_view.data() )
     {
       static_assert( Kokkos::is_view< HostViewType >::value
         , "Kokkos::Serial reduce result must be a View" );
@@ -623,11 +535,13 @@ private:
 
   typedef Kokkos::RangePolicy< Traits ... > Policy ;
   typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -635,10 +549,8 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
-      reference_type update = ValueInit::init( m_functor , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( i , update , true );
@@ -648,11 +560,9 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
       const TagType t{} ;
-      reference_type update = ValueInit::init( m_functor , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( t , i , update , true );
@@ -664,9 +574,22 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type)
-        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
-      this-> template exec< WorkTag >( ptr );
+      const size_t pool_reduce_size = Analysis::value_size( m_functor );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      reference_type update =
+        ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
+
+      this-> template exec< WorkTag >( update );
     }
 
   inline
@@ -696,6 +619,8 @@ class ParallelFor< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
   typedef typename Policy::member_type                       Member ;
 
@@ -706,21 +631,21 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec() const
+  exec( HostThreadTeamData & data ) const
     {
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( Member(ileague,m_league,m_shared) );
+        m_functor( Member(data,ileague,m_league) );
       }
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec() const
+  exec( HostThreadTeamData & data ) const
     {
       const TagType t{} ;
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( t , Member(ileague,m_league,m_shared) );
+        m_functor( t , Member(data,ileague,m_league) );
       }
     }
 
@@ -729,15 +654,28 @@ public:
   inline
   void execute() const
     {
-      Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
-      this-> template exec< typename Policy::work_tag >();
+      const size_t pool_reduce_size  = 0 ; // Never shrinks
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      this->template exec< typename Policy::work_tag >( data );
     }
 
   ParallelFor( const FunctorType & arg_functor
              , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_league(  arg_policy.league_size() )
-    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
     { }
 };
 
@@ -752,18 +690,22 @@ class ParallelReduce< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef typename Policy::member_type                       Member ;
   typedef typename Policy::work_tag                          WorkTag ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const int          m_league ;
@@ -774,33 +716,23 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( HostThreadTeamData & data , reference_type update ) const
     {
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( Member(ileague,m_league,m_shared) , update );
+        m_functor( Member(data,ileague,m_league) , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( HostThreadTeamData & data , reference_type update ) const
     {
       const TagType t{} ;
 
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( t , Member(ileague,m_league,m_shared) , update );
+        m_functor( t , Member(data,ileague,m_league) , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -808,10 +740,31 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
+      const size_t pool_reduce_size  =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
 
-      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( data , update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class ViewType >
@@ -825,8 +778,10 @@ public:
     : m_functor( arg_functor )
     , m_league( arg_policy.league_size() )
     , m_reducer( InvalidType() )
-    , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
+    , m_result_ptr( arg_result.data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
     {
       static_assert( Kokkos::is_view< ViewType >::value
         , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@@ -838,13 +793,15 @@ public:
 
   inline
   ParallelReduce( const FunctorType & arg_functor
-    , Policy       arg_policy
-    , const ReducerType& reducer )
-  : m_functor( arg_functor )
-  , m_league(  arg_policy.league_size() )
-  , m_reducer( reducer )
-  , m_result_ptr(  reducer.result_view().data() )
-  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_league(  arg_policy.league_size() )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
   {
   /*static_assert( std::is_same< typename ViewType::memory_space
                           , Kokkos::HostSpace >::value
@@ -858,261 +815,6 @@ public:
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
-/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
-
-namespace Kokkos {
-namespace Impl {
-
-template<typename iType>
-struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
-  typedef iType index_type;
-  const iType begin ;
-  const iType end ;
-  enum {increment = 1};
-  const SerialTeamMember& thread;
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
-    : begin(0)
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
-    : begin( arg_begin )
-    , end(   arg_end)
-    , thread( arg_thread )
-    {}
-};
-
-  template<typename iType>
-  struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
-    typedef iType index_type;
-    enum {start = 0};
-    const iType end;
-    enum {increment = 1};
-
-    KOKKOS_INLINE_FUNCTION
-    ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
-      end( count )
-    {}
-  };
-
-} // namespace Impl
-
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
-TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count );
-}
-
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::SerialTeamMember >
-TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const iType2 & end )
-{
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) );
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
-  ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  result = ValueType();
-
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-
-  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
-}
-
-} //namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-  #pragma ivdep
-  #endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
- * Depending on the target execution space the operator might be called twice: once with final=false
- * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
- * "i" needs to be added to val no matter whether final==true or not. In a serial execution
- * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
- * to the final sum value over all vector lanes.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
-  typedef typename ValueTraits::value_type value_type ;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,scan_val,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-}
-
-//----------------------------------------------------------------------------
 
 #include <impl/Kokkos_Serial_Task.hpp>
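For orientation, here is a minimal usage sketch of the TeamPolicy reduction that the rewritten Serial ParallelReduce above services through HostThreadTeamData. It is not part of the patch: it assumes a build with the Serial back-end enabled, and the league size, lambda body, and variable names are illustrative only.

#include <Kokkos_Core.hpp>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );
  {
    typedef Kokkos::TeamPolicy< Kokkos::Serial >  policy_type ;
    typedef policy_type::member_type              member_type ;

    const int league_size = 8 ;
    double total = 0 ;

    // Each team contributes its league rank; the Serial back-end runs the
    // teams one after another, accumulating into the pool-reduce scratch
    // that execute() resizes above.
    Kokkos::parallel_reduce( policy_type( league_size , 1 )
      , KOKKOS_LAMBDA( const member_type & team , double & update )
        { update += double( team.league_rank() ); }
      , total );
  }
  Kokkos::finalize();
  return 0 ;
}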
 
diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
index e4271aa18814160f58fde909b619c78cc25761fa..e25039d236d68544cecf3dc968f853179e94a52d 100644
--- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -82,6 +82,15 @@ class Future ;
 template< typename Space >
 class TaskScheduler ;
 
+template< typename Space >
+void wait( TaskScheduler< Space > const & );
+
+template< typename Space >
+struct is_scheduler : public std::false_type {};
+
+template< typename Space >
+struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
+
 } // namespace Kokkos
 
 #include <impl/Kokkos_TaskQueue.hpp>
@@ -109,9 +118,6 @@ namespace Impl {
 template< typename Space , typename ResultType , typename FunctorType >
 class TaskBase ;
 
-template< typename Space >
-class TaskExec ;
-
 } // namespace Impl
 } // namespace Kokkos
 
@@ -312,6 +318,19 @@ public:
     }
 };
 
+// Is a Future with the given execution space
+template< typename , typename ExecSpace = void >
+struct is_future : public std::false_type {};
+
+template< typename Arg1 , typename Arg2 , typename ExecSpace >
+struct is_future< Future<Arg1,Arg2> , ExecSpace >
+  : public std::integral_constant
+      < bool ,
+      ( std::is_same< ExecSpace , void >::value ||
+        std::is_same< ExecSpace
+                    , typename Future<Arg1,Arg2>::execution_space >::value )
+      > {};
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -319,18 +338,59 @@ public:
 
 namespace Kokkos {
 
-enum TaskType { TaskTeam   = Impl::TaskBase<void,void,void>::TaskTeam
-              , TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
+enum class TaskPriority : int { High    = 0
+                              , Regular = 1
+                              , Low     = 2 };
 
-enum TaskPriority { TaskHighPriority    = 0
-                  , TaskRegularPriority = 1
-                  , TaskLowPriority     = 2 };
+} // namespace Kokkos
 
-template< typename Space >
-void wait( TaskScheduler< Space > const & );
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< int TaskEnum , typename DepFutureType >
+struct TaskPolicyData
+{
+  using execution_space = typename DepFutureType::execution_space ;
+  using scheduler_type  = TaskScheduler< execution_space > ;
+
+  enum : int { m_task_type = TaskEnum };
+
+  scheduler_type const * m_scheduler ;
+  DepFutureType  const   m_dependence ;
+  int                    m_priority ;
+
+  TaskPolicyData() = delete ;
+  TaskPolicyData( TaskPolicyData && ) = default ;
+  TaskPolicyData( TaskPolicyData const & ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( DepFutureType             && arg_future
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( 0 )
+    , m_dependence( arg_future )
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( scheduler_type       const & arg_scheduler
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( & arg_scheduler )
+    , m_dependence()
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+};
 
+} // namespace Impl
 } // namespace Kokkos
 
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -348,52 +408,13 @@ private:
   queue_type * m_queue ;
 
   //----------------------------------------
-  // Process optional arguments to spawn and respawn functions
-
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const ) {}
-
-  // TaskTeam or TaskSingle
-  template< typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , TaskType const & arg
-             , Options const & ... opts )
-    {
-      task->m_task_type = arg ;
-      assign( task , opts ... );
-    }
-
-  // TaskHighPriority or TaskRegularPriority or TaskLowPriority
-  template< typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , TaskPriority const & arg
-             , Options const & ... opts )
-    {
-      task->m_priority = arg ;
-      assign( task , opts ... );
-    }
-
-  // Future for a dependence
-  template< typename A1 , typename A2 , typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , Future< A1 , A2 > const & arg
-             , Options const & ... opts )
-    {
-      task->add_dependence( arg.m_task );
-      assign( task , opts ... );
-    }
-
-  //----------------------------------------
 
 public:
 
-  using execution_policy = TaskScheduler ;
   using execution_space  = ExecSpace ;
   using memory_space     = typename queue_type::memory_space ;
-  using member_type      = Kokkos::Impl::TaskExec< ExecSpace > ;
+  using member_type      =
+    typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
 
   KOKKOS_INLINE_FUNCTION
   TaskScheduler() : m_track(), m_queue(0) {}
@@ -460,18 +481,13 @@ public:
 
   //----------------------------------------
 
-  /**\brief  A task spawns a task with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   *  3) Team or Serial
-   */
-  template< typename FunctorType , typename ... Options >
-  KOKKOS_FUNCTION
-  Future< typename FunctorType::value_type , ExecSpace >
-  task_spawn( FunctorType const & arg_functor
-            , Options const & ... arg_options
-            ) const
+  template< int TaskEnum , typename DepFutureType , typename FunctorType >
+  KOKKOS_FUNCTION static
+  Kokkos::Future< typename FunctorType::value_type , execution_space >
+  spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+       , typename task_base::function_type                    arg_function
+       , FunctorType                                       && arg_functor
+       )
     {
       using value_type  = typename FunctorType::value_type ;
       using future_type = Future< value_type , execution_space > ;
@@ -479,11 +495,21 @@ public:
                                         , value_type
                                         , FunctorType > ;
 
+      queue_type * const queue =
+        arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
+        arg_policy.m_dependence.m_task
+          ? arg_policy.m_dependence.m_task->m_queue
+          : (queue_type*) 0 );
+
+      if ( 0 == queue ) {
+        Kokkos::abort("Kokkos spawn given null Future" );
+      }
+
       //----------------------------------------
       // Give single-thread back-ends an opportunity to clear
       // queue of ready tasks before allocating a new task
 
-      m_queue->iff_single_thread_recursive_execute();
+      queue->iff_single_thread_recursive_execute();
 
       //----------------------------------------
 
@@ -491,176 +517,129 @@ public:
 
       // Allocate task from memory pool
       f.m_task =
-        reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
+        reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
 
       if ( f.m_task ) {
 
         // Placement new construction
-        new ( f.m_task ) task_type( arg_functor );
-
-        // Reference count starts at two
-        // +1 for matching decrement when task is complete
-        // +1 for future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = sizeof(task_type);
-
-        assign( f.m_task , arg_options... );
-
-        // Spawning from within the execution space so the
-        // apply function pointer is guaranteed to be valid
-        f.m_task->m_apply = task_type::apply ;
-
-        m_queue->schedule( f.m_task );
-        // this task may be updated or executed at any moment
+        // Reference count starts at two:
+        //   +1 for the matching decrement when task is complete
+        //   +1 for the future
+        new ( f.m_task )
+          task_type( arg_function
+                   , queue
+                   , arg_policy.m_dependence.m_task /* dependence */
+                   , 2                              /* reference count */
+                   , int(sizeof(task_type))         /* allocation size */
+                   , int(arg_policy.m_task_type)
+                   , int(arg_policy.m_priority)
+                   , std::move(arg_functor) );
+
+        // The dependence (if any) is processed immediately
+        // within the schedule function, so the dependence's
+        // reference count does not need to be incremented for
+        // this assignment.
+
+        queue->schedule_runnable( f.m_task );
+        // This task may be updated or executed at any moment,
+        // even during the call to 'schedule'.
       }
 
       return f ;
     }
 
-  /**\brief  The host process spawns a task with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   *  3) Team or Serial
-   */
-  template< typename FunctorType , typename ... Options >
-  inline
-  Future< typename FunctorType::value_type , ExecSpace >
-  host_spawn( FunctorType const & arg_functor
-            , Options const & ... arg_options
-            ) const
+  template< typename FunctorType , typename A1 , typename A2 >
+  KOKKOS_FUNCTION static
+  void
+  respawn( FunctorType         * arg_self
+         , Future<A1,A2> const & arg_dependence
+         , TaskPriority  const & arg_priority
+         )
     {
+      // Precondition: task is in Executing state
+
       using value_type  = typename FunctorType::value_type ;
-      using future_type = Future< value_type , execution_space > ;
       using task_type   = Impl::TaskBase< execution_space
                                         , value_type
                                         , FunctorType > ;
 
-      if ( m_queue == 0 ) {
-        Kokkos::abort("Kokkos::TaskScheduler not initialized");
-      }
+      task_type * const task = static_cast< task_type * >( arg_self );
 
-      future_type f ;
+      task->m_priority = static_cast<int>(arg_priority);
 
-      // Allocate task from memory pool
-      f.m_task =
-        reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
-
-      if ( f.m_task ) {
-
-        // Placement new construction
-        new( f.m_task ) task_type( arg_functor );
-
-        // Reference count starts at two:
-        // +1 to match decrement when task completes
-        // +1 for the future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = sizeof(task_type);
-
-        assign( f.m_task , arg_options... );
-
-        // Potentially spawning outside execution space so the
-        // apply function pointer must be obtained from execution space.
-        // Required for Cuda execution space function pointer.
-        m_queue->template proc_set_apply< FunctorType >( & f.m_task->m_apply );
+      task->add_dependence( arg_dependence.m_task );
 
-        m_queue->schedule( f.m_task );
-      }
-      return f ;
+      // Postcondition: task is in Executing-Respawn state
     }
 
+  //----------------------------------------
   /**\brief  Return a future that is complete
    *         when all input futures are complete.
    */
   template< typename A1 , typename A2 >
-  KOKKOS_FUNCTION
-  Future< ExecSpace >
-  when_all( int narg , Future< A1 , A2 > const * const arg ) const
+  KOKKOS_FUNCTION static
+  Future< execution_space >
+  when_all( Future< A1 , A2 > const arg[] , int narg )
     {
-      static_assert
-        ( std::is_same< execution_space
-                      , typename Future< A1 , A2 >::execution_space
-                      >::value
-        , "Future must have same execution space" );
-
-      using future_type = Future< ExecSpace > ;
-      using task_base   = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
+      using future_type = Future< execution_space > ;
+      using task_base   = Kokkos::Impl::TaskBase< execution_space , void , void > ;
 
       future_type f ;
 
-      size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
-
-      f.m_task =
-        reinterpret_cast< task_base * >( m_queue->allocate( size ) );
+      if ( narg ) {
 
-      if ( f.m_task ) {
-
-        new( f.m_task ) task_base();
-
-        // Reference count starts at two:
-        // +1 to match decrement when task completes
-        // +1 for the future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = size ;
-        f.m_task->m_dep_count  = narg ;
-        f.m_task->m_task_type  = task_base::Aggregate ;
-
-        task_base ** const dep = f.m_task->aggregate_dependences();
-
-        // Assign dependences to increment their reference count
-        // The futures may be destroyed upon returning from this call
-        // so increment reference count to track this assignment.
+        queue_type * queue = 0 ;
 
         for ( int i = 0 ; i < narg ; ++i ) {
-          task_base * const t = dep[i] = arg[i].m_task ;
+          task_base * const t = arg[i].m_task ;
           if ( 0 != t ) {
+            // Increment reference count to track subsequent assignment.
             Kokkos::atomic_increment( &(t->m_ref_count) );
+            if ( queue == 0 ) {
+              queue = t->m_queue ;
+            }
+            else if ( queue != t->m_queue ) {
+              Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
+            }
           }
         }
 
-        m_queue->schedule( f.m_task );
-        // this when_all may be processed at any moment
-      }
+        if ( queue != 0 ) {
 
-      return f ;
-    }
+          size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
 
-  /**\brief  An executing task respawns itself with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   */
-  template< class FunctorType , typename ... Options >
-  KOKKOS_FUNCTION
-  void respawn( FunctorType * task_self
-              , Options const & ... arg_options ) const
-    {
-      using value_type  = typename FunctorType::value_type ;
-      using task_type   = Impl::TaskBase< execution_space
-                                        , value_type
-                                        , FunctorType > ;
+          f.m_task =
+            reinterpret_cast< task_base * >( queue->allocate( size ) );
 
-      task_type * const task = static_cast< task_type * >( task_self );
+          if ( f.m_task ) {
 
-      // Reschedule task with no dependences.
-      m_queue->reschedule( task );
+            // Reference count starts at two:
+            // +1 to match decrement when task completes
+            // +1 for the future
+            new( f.m_task ) task_base( queue
+                                     , 2     /* reference count */
+                                     , size  /* allocation size */
+                                     , narg  /* dependence count */
+                                     );
 
-      // Dependences, if requested, are added here through parsing the arguments.
-      assign( task , arg_options... );
-    }
+            // Assign dependences; their reference counts were already incremented above
 
-  //----------------------------------------
+            task_base ** const dep = f.m_task->aggregate_dependences();
 
-  template< typename S >
-  friend
-  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+            for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
+
+            queue->schedule_aggregate( f.m_task );
+            // this when_all may be processed at any moment
+          }
+        }
+      }
+
+      return f ;
+    }
 
   //----------------------------------------
 
-  inline
+  KOKKOS_INLINE_FUNCTION
   int allocation_capacity() const noexcept
     { return m_queue->m_memory.get_mem_size(); }
 
@@ -676,12 +655,192 @@ public:
   long allocated_task_count_accum() const noexcept
     { return m_queue->m_accum_alloc ; }
 
+  //----------------------------------------
+
+  template< typename S >
+  friend
+  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+
 };
 
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Construct a TaskTeam execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskTeam( T            const & arg
+        , TaskPriority const & arg_priority = TaskPriority::Regular
+        )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskTeam argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+// Construct a TaskSingle execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskSingle( T            const & arg
+          , TaskPriority const & arg_priority = TaskPriority::Regular
+          )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskSingle argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+//----------------------------------------------------------------------------
+
+/**\brief  A host control thread spawns a task with options
+ *
+ *  1) Team or Serial
+ *  2) With scheduler or dependence
+ *  3) High, Normal, or Low priority
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos host_spawn requires TaskTeam or TaskSingle" );
+
+  // May be spawning a Cuda task, must use the specialization
+  // to query on-device function pointer.
+  typename task_type::function_type const ptr =
+    Kokkos::Impl::TaskQueueSpecialization< exec_space >::
+      template get_function_pointer< task_type >();
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
+
+/**\brief  A task spawns a task with options
+ *
+ *  1) Team or Serial
+ *  2) With scheduler or dependence
+ *  3) High, Normal, or Low priority
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+KOKKOS_INLINE_FUNCTION
+task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
+    defined( KOKKOS_ENABLE_CUDA )
+
+  static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
+               , "Error calling Kokkos::task_spawn for Cuda space within Host code" );
+
+#endif
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos host_spawn requires TaskTeam or TaskSingle" );
+
+  typename task_type::function_type const ptr = task_type::apply ;
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
+
+/**\brief  A task respawns itself with options
+ *
+ *  1) With scheduler or dependence
+ *  2) High, Normal, or Low priority
+ */
+template< typename FunctorType , typename T >
+void
+KOKKOS_INLINE_FUNCTION
+respawn( FunctorType         * arg_self
+       , T             const & arg
+       , TaskPriority  const & arg_priority = TaskPriority::Regular
+       )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos respawn argument must be Future or TaskScheduler" );
+
+  TaskScheduler< typename T::execution_space >::
+    respawn( arg_self , arg , arg_priority );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename A1 , typename A2 >
+KOKKOS_INLINE_FUNCTION
+Future< typename Future< A1 , A2 >::execution_space >
+when_all( Future< A1 , A2 > const arg[]
+        , int                     narg
+        )
+{
+  return TaskScheduler< typename Future<A1,A2>::execution_space >::
+    when_all( arg , narg );
+}
+
+//----------------------------------------------------------------------------
+// Wait for all runnable tasks to complete
+
 template< typename ExecSpace >
 inline
-void wait( TaskScheduler< ExecSpace > const & policy )
-{ policy.m_queue->execute(); }
+void wait( TaskScheduler< ExecSpace > const & scheduler )
+{ scheduler.m_queue->execute(); }
 
 } // namespace Kokkos
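To illustrate the reworked host-side interface end to end, here is a hedged sketch (not part of the patch) that spawns one single-thread task and waits on the scheduler. It assumes a build with task-DAG support and the Serial back-end; the functor, the free function, and the externally constructed scheduler (whose memory-pool constructor is not shown in this hunk) are illustrative assumptions.

#include <Kokkos_Core.hpp>
#include <Kokkos_TaskScheduler.hpp>

typedef Kokkos::Serial                        exec_space ;
typedef Kokkos::TaskScheduler< exec_space >   sched_type ;

// A single-shot task functor: the nested value_type and the
// operator()( member , result ) signature follow the FunctorType
// requirements that scheduler::spawn() relies on above.
struct HelloTask {
  typedef int value_type ;

  KOKKOS_INLINE_FUNCTION
  void operator()( sched_type::member_type & /* member */ , int & result )
    { result = 42 ; }
};

void spawn_and_wait( sched_type & sched )
{
  // Spawn from host code with regular priority; host_spawn() resolves the
  // task's apply function pointer through the queue specialization before
  // forwarding to scheduler::spawn().
  Kokkos::Future< int , exec_space > f =
    Kokkos::host_spawn( Kokkos::TaskSingle( sched , Kokkos::TaskPriority::Regular )
                      , HelloTask() );

  // Block until all runnable tasks, including f, have completed.
  Kokkos::wait( sched );
}

TaskTeam( sched , priority ) is used the same way when the task body should execute with a whole team, and respawn() and when_all() compose with the same Future type, as documented above.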
 
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
index aca482b427a11a21ecc5d71dddfffb715438fa85..8aa968d0535f1f6c32ac170a73d2ec60d018d824 100644
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -230,4 +230,3 @@ struct VerifyExecutionCanAccessMemorySpace
 #endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
 #endif /* #define KOKKOS_THREADS_HPP */
 
-
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 316f61fd4d9fcd4c7ce4ec37592659deef006bce..0668f89c86e040e5dd1017fc3c3f0a233e9affa3 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -31,23 +31,23 @@ KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
 CONDITIONAL_COPIES =
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
-	CONDITIONAL_COPIES += copy-cuda
+  KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+  CONDITIONAL_COPIES += copy-cuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
-	CONDITIONAL_COPIES += copy-threads
+  KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+  CONDITIONAL_COPIES += copy-threads
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
-	CONDITIONAL_COPIES += copy-qthread
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+  CONDITIONAL_COPIES += copy-qthreads
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
-	CONDITIONAL_COPIES += copy-openmp
+  KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+  CONDITIONAL_COPIES += copy-openmp
 endif
 
 ifeq ($(KOKKOS_OS),CYGWIN)
@@ -60,6 +60,12 @@ ifeq ($(KOKKOS_OS),Darwin)
   COPY_FLAG =
 endif
 
+ifeq ($(KOKKOS_DEBUG),"no")
+  KOKKOS_DEBUG_CMAKE = OFF
+else
+  KOKKOS_DEBUG_CMAKE = ON
+endif
+
 messages: 
 	echo "Start Build"
 
@@ -91,6 +97,7 @@ build-makefile-kokkos:
 	echo "" >> Makefile.kokkos
 	echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos
+	echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
@@ -107,7 +114,55 @@ build-makefile-kokkos:
 		> Makefile.kokkos.tmp
 	mv -f Makefile.kokkos.tmp Makefile.kokkos
 
-build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
+build-cmake-kokkos:
+	rm -f kokkos.cmake
+	echo "#Global Settings used to generate this library" >> kokkos.cmake
+	echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake
+	echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake
+	echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake
+	echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?)\")" >> kokkos.cmake
+	echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos templates list\")" >> kokkos.cmake
+	echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake
+	echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake
+	echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake
+	echo "if(NOT $ENV{CXX})" >> kokkos.cmake
+	echo '  message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX) ; make sure this is the behavior you intended.")' >> kokkos.cmake
+	echo "endif()" >> kokkos.cmake
+	echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake
+	echo "  set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake
+	echo "else()" >> kokkos.cmake
+	echo '  set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
+	echo "endif()" >> kokkos.cmake
+	echo "" >> kokkos.cmake  
+	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
+	echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
+	echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
+	echo "" >> kokkos.cmake  
+	echo "#Variables used in application Makefiles" >> kokkos.cmake
+	echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	sed \
+		-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
+	 	-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
+	 	-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
+	 	-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
+	 	-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
+	 	-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \
+	 	> kokkos.cmake.tmp
+	mv -f kokkos.cmake.tmp kokkos.cmake
+
+build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
 
 mkdir: 
 	mkdir -p $(PREFIX)
@@ -124,9 +179,9 @@ copy-threads: mkdir
 	mkdir -p $(PREFIX)/include/Threads
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
 
-copy-qthread: mkdir
-	mkdir -p $(PREFIX)/include/Qthread
-	cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
+copy-qthreads: mkdir
+	mkdir -p $(PREFIX)/include/Qthreads
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
 
 copy-openmp: mkdir
 	mkdir -p $(PREFIX)/include/OpenMP
@@ -137,6 +192,7 @@ install: mkdir $(CONDITIONAL_COPIES) build-lib
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
 	cp $(COPY_FLAG) Makefile.kokkos $(PREFIX)
+	cp $(COPY_FLAG) kokkos.cmake $(PREFIX)
 	cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
 	cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
index a61791ca9c7be2779820b5ed96db1aec02644654..ecacffb77331c9d14134dc2dcc9a8eafabbc175f 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -46,7 +46,6 @@
 
 #include <omp.h>
 #include <iostream>
-#include <Kokkos_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMPexec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
@@ -107,58 +106,41 @@ private:
 
 public:
 
-  inline void execute() const {
-    this->template execute_schedule<typename Policy::schedule_type::type>();
-  }
-
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
-    execute_schedule() const
+  inline void execute() const
     {
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-        ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
-      }
-/* END #pragma omp parallel */
-    }
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
 
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-    execute_schedule() const
-    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
 
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        std::pair<int64_t,int64_t> range(0,0);
 
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        do {
 
-        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
-        exec.reset_steal_target();
-        #pragma omp barrier
-        
-        long work_index = exec.get_work_index();
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
 
-        while(work_index != -1) {
-          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
-          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
-          ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
-          work_index = exec.get_work_index();
-        }
+          ParallelFor::template
+            exec_range< WorkTag >( m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin() );
 
+        } while ( is_dynamic && 0 <= range.first );
       }
-/* END #pragma omp parallel */
+      // END #pragma omp parallel
     }
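A usage sketch of the scheduling choice this loop serves (illustrative only; the view, bounds, and scaling factor are assumptions): requesting Kokkos::Dynamic makes the branch above pull work through get_work_stealing_chunk(), while the default static schedule takes one fixed partition per thread.

#include <Kokkos_Core.hpp>

void scale( Kokkos::View< double * , Kokkos::OpenMP > x
          , const int n , const double a )
{
  // Dynamic schedule: idle threads steal remaining chunks of the
  // iteration range instead of keeping a fixed static partition.
  typedef Kokkos::RangePolicy< Kokkos::OpenMP
                             , Kokkos::Schedule< Kokkos::Dynamic > > policy_type ;

  Kokkos::parallel_for( policy_type( 0 , n )
    , KOKKOS_LAMBDA( const int i ) { x(i) *= a ; } );
}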
 
   inline
@@ -193,17 +175,18 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
 
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
   // Static Assert WorkTag void if ReducerType not InvalidType
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -247,92 +230,70 @@ private:
 
 public:
 
-  inline void execute() const {
-    this->template execute_schedule<typename Policy::schedule_type::type>();
-  }
-
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
-    execute_schedule() const
+  inline void execute() const
     {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_bytes =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        ParallelReduce::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
-      }
-/* END #pragma omp parallel */
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-      // Reduction:
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
 
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
 
-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
-      }
+        reference_type update =
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                         , data.pool_reduce_local() );
 
-      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+        std::pair<int64_t,int64_t> range(0,0);
 
-      if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        do {
 
-        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
-      }
-    }
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
 
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-    execute_schedule() const
-    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+          ParallelReduce::template
+            exec_range< WorkTag >( m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin()
+                                 , update );
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
-
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-
-        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
-        exec.reset_steal_target();
-        #pragma omp barrier
-
-        long work_index = exec.get_work_index();
-
-        reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
-        while(work_index != -1) {
-          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
-          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
-          ParallelReduce::template exec_range< WorkTag >
-            ( m_functor , begin,end
-            , update );
-          work_index = exec.get_work_index();
-        }
+        } while ( is_dynamic && 0 <= range.first );
       }
-/* END #pragma omp parallel */
+// END #pragma omp parallel
 
       // Reduction:
 
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
 
       for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
       }
 
       Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
 
         for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
@@ -394,17 +355,18 @@ private:
 
   typedef Kokkos::RangePolicy< Traits ... > Policy ;
 
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
   typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -452,53 +414,63 @@ public:
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
 
-      OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+      const int    value_count       = Analysis::value_count( m_functor );
+      const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
+
+      OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        const pointer_type ptr =
-          pointer_type( exec.scratch_reduce() ) +
-          ValueTraits::value_count( m_functor );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+
+        const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
+
+        reference_type update_sum =
+          ValueInit::init( m_functor , data.pool_reduce_local() );
+
         ParallelScan::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueInit::init( m_functor , ptr ) , false );
-      }
-/* END #pragma omp parallel */
+          ( m_functor , range.begin() , range.end() , update_sum , false );
 
-      {
-        const unsigned thread_count = OpenMPexec::pool_size();
-        const unsigned value_count  = ValueTraits::value_count( m_functor );
+        if ( data.pool_rendezvous() ) {
 
-        pointer_type ptr_prev = 0 ;
+          pointer_type ptr_prev = 0 ;
 
-        for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+          const int n = data.pool_size();
 
-          pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
+          for ( int i = 0 ; i < n ; ++i ) {
 
-          if ( ptr_prev ) {
-            for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
-            ValueJoin::join( m_functor , ptr + value_count , ptr );
-          }
-          else {
-            ValueInit::init( m_functor , ptr );
+            pointer_type ptr = (pointer_type)
+              data.pool_member(i)->pool_reduce_local();
+
+            if ( i ) {
+              for ( int j = 0 ; j < value_count ; ++j ) {
+                ptr[j+value_count] = ptr_prev[j+value_count] ;
+              }
+              ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
+            }
+            else {
+              ValueInit::init( m_functor , ptr + value_count );
+            }
+
+            ptr_prev = ptr ;
           }
 
-          ptr_prev = ptr ;
+          data.pool_rendezvous_release();
         }
-      }
 
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        const pointer_type ptr = pointer_type( exec.scratch_reduce() );
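+        // Second pass: each thread re-runs its range in "final" mode,
+        // seeded with the exclusive prefix stored just past its local sum.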
+        reference_type update_base =
+          ValueOps::reference
+            ( ((pointer_type)data.pool_reduce_local()) + value_count );
+
         ParallelScan::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueOps::reference( ptr ) , true );
+          ( m_functor , range.begin() , range.end() , update_base , true );
       }
 /* END #pragma omp parallel */
+
     }
 
   //----------------------------------------
@@ -530,55 +502,59 @@ class ParallelFor< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
-  typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
   const int          m_shmem_size ;
 
-  template< class TagType, class Schedule >
+  template< class TagType >
   inline static
-  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
-  exec_team( const FunctorType & functor , Member member )
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( member );
-      }
-    }
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
 
-  template< class TagType, class Schedule >
-  inline static
-  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
-  exec_team( const FunctorType & functor , Member member )
-    {
-      const TagType t{} ;
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( t , member );
-      }
-    }
+        functor( Member( data, r , league_size ) );
 
-  template< class TagType, class Schedule >
-  inline static
-  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
-  exec_team( const FunctorType & functor , Member member )
-    {
-      #pragma omp barrier
-      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
-        functor( member );
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
-  template< class TagType, class Schedule >
+
+  template< class TagType >
   inline static
-  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-  exec_team( const FunctorType & functor , Member member )
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      #pragma omp barrier
-      const TagType t{} ;
-      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
-        functor( t , member );
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
@@ -587,31 +563,75 @@ public:
   inline
   void execute() const
     {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
 
-      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+      const size_t pool_reduce_size = 0 ; // Never shrinks
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
 
-      OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
+      OpenMPexec::resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
 
 #pragma omp parallel
       {
-        ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
-          ( m_functor
-          , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+
+        const int active = data.organize_team( m_policy.team_size() );
+
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
+
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        if ( active ) {
+
+          std::pair<int64_t,int64_t> range(0,0);
+
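+          // A static schedule executes its single partition once; a dynamic
+          // schedule keeps stealing chunks until get_work_stealing_chunk()
+          // returns a negative begin index.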
+          do {
+
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelFor::template exec_team< WorkTag >
+              ( m_functor , data
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        }
+
+        data.disband_team();
       }
-/* END #pragma omp parallel */
+// END #pragma omp parallel
     }
 
+
   inline
   ParallelFor( const FunctorType & arg_functor ,
                const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
     {}
 };
 
+//----------------------------------------------------------------------------
 
 template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
@@ -622,20 +642,26 @@ class ParallelReduce< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... >         Policy ;
 
-  typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
+                            , FunctorType, ReducerType> ReducerConditional;
 
-  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag >  ValueJoin ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
@@ -645,22 +671,48 @@ private:
 
   template< class TagType >
   inline static
-  typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec_team( const FunctorType & functor , Member member , reference_type update )
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( member , update );
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
+
   template< class TagType >
   inline static
-  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec_team( const FunctorType & functor , Member member , reference_type update )
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      const TagType t{} ;
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( t , member , update );
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
@@ -669,44 +721,89 @@ public:
   inline
   void execute() const
     {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
 
-      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
+      OpenMPexec::resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-        ParallelReduce::template exec_team< WorkTag >
-          ( m_functor
-          , Member( exec , m_policy , m_shmem_size, 0 )
-          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
-      }
-/* END #pragma omp parallel */
+        const int active = data.organize_team( m_policy.team_size() );
 
-      {
-        const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
-
-        int max_active_threads = OpenMPexec::pool_size();
-        if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
-          max_active_threads = m_policy.league_size()* m_policy.team_size();
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
 
-        for ( int i = 1 ; i < max_active_threads ; ++i ) {
-          ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
         }
 
-        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+        if ( active ) {
+          reference_type update =
+            ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
+
+          std::pair<int64_t,int64_t> range(0,0);
 
-        if ( m_result_ptr ) {
-          const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+          do {
 
-          for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelReduce::template exec_team< WorkTag >
+              ( m_functor , data , update
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        } else {
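+          // Threads left out of a team still initialize their pool reduce
+          // scratch so the serial join below reads initialized data.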
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
         }
+
+        data.disband_team();
+      }
+// END #pragma omp parallel
+
+      // Reduction:
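+      // Serial step on the master thread: join every pool member's partial
+      // result into thread 0's scratch, apply the functor's final() hook,
+      // and copy the result out if a result pointer was provided.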
+
+      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
     }
 
+  //----------------------------------------
+
   template< class ViewType >
   inline
   ParallelReduce( const FunctorType  & arg_functor ,
@@ -720,7 +817,10 @@ public:
     , m_policy(  arg_policy )
     , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
     {}
 
   inline
@@ -731,7 +831,10 @@ public:
   , m_policy(  arg_policy )
   , m_reducer( reducer )
   , m_result_ptr(  reducer.result_view().data() )
-  , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  , m_shmem_size( arg_policy.scratch_size(0) +
+                  arg_policy.scratch_size(1) +
+                  FunctorTeamShmemSize< FunctorType >
+                    ::value( arg_functor , arg_policy.team_size() ) )
   {
   /*static_assert( std::is_same< typename ViewType::memory_space
                           , Kokkos::HostSpace >::value
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
index 5b3e9873e17bc360f28a8338b7b59b69cf627ec3..9144d8c2799a7db81af0886aafcff1ebcd828833 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -46,6 +46,7 @@
 #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
 
 #include <impl/Kokkos_TaskQueue_impl.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -55,231 +56,214 @@ namespace Impl {
 
 template class TaskQueue< Kokkos::OpenMP > ;
 
-//----------------------------------------------------------------------------
-
-TaskExec< Kokkos::OpenMP >::
-TaskExec()
-  : m_self_exec( 0 )
-  , m_team_exec( 0 )
-  , m_sync_mask( 0 )
-  , m_sync_value( 0 )
-  , m_sync_step( 0 )
-  , m_group_rank( 0 )
-  , m_team_rank( 0 )
-  , m_team_size( 1 )
-{
-}
-
-TaskExec< Kokkos::OpenMP >::
-TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
-  : m_self_exec( & arg_exec )
-  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
-  , m_sync_mask( 0 )
-  , m_sync_value( 0 )
-  , m_sync_step( 0 )
-  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
-  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
-  , m_team_size(  arg_team_size )
-{
-  // This team spans
-  //    m_self_exec->pool_rev( team_size * group_rank )
-  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
-
-  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
-
-  sync[0] = int64_t(0) ;
-  sync[1] = int64_t(0) ;
-
-  for ( int i = 0 ; i < m_team_size ; ++i ) {
-    m_sync_value |= int64_t(1) << (8*i);
-    m_sync_mask  |= int64_t(3) << (8*i);
-  }
+class HostThreadTeamDataSingleton : private HostThreadTeamData {
+private:
+
+  HostThreadTeamDataSingleton() : HostThreadTeamData()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      const size_t num_pool_reduce_bytes  =   32 ;
+      const size_t num_team_reduce_bytes  =   32 ;
+      const size_t num_team_shared_bytes  = 1024 ;
+      const size_t num_thread_local_bytes = 1024 ;
+      const size_t alloc_bytes =
+        HostThreadTeamData::scratch_size( num_pool_reduce_bytes
+                                        , num_team_reduce_bytes
+                                        , num_team_shared_bytes
+                                        , num_thread_local_bytes );
+
+      HostThreadTeamData::scratch_assign
+        ( space.allocate( alloc_bytes )
+        , alloc_bytes
+        , num_pool_reduce_bytes
+        , num_team_reduce_bytes
+        , num_team_shared_bytes
+        , num_thread_local_bytes );
+    }
+
+  ~HostThreadTeamDataSingleton()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      space.deallocate( HostThreadTeamData::scratch_buffer()
+                      , HostThreadTeamData::scratch_bytes() );
+    }
+
+public:
+
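+  // Lazily constructed, process-wide team data used to back the Member
+  // handed to single-thread tasks.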
+  static HostThreadTeamData & singleton()
+    {
+      static HostThreadTeamDataSingleton s ;
+      return s ;
+    }
+};
 
-  Kokkos::memory_fence();
-}
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+//----------------------------------------------------------------------------
 
-void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
+void TaskQueueSpecialization< Kokkos::OpenMP >::execute
+  ( TaskQueue< Kokkos::OpenMP > * const queue )
 {
-  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
-    Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
-  }
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
-  // Use team shared memory to synchronize.
-  // Alternate memory locations between barriers to avoid a sequence
-  // of barriers overtaking one another.
+  static task_root_type * const end =
+    (task_root_type *) task_root_type::EndTag ;
 
-  int64_t volatile * const sync =
-    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+  HostThreadTeamData & team_data_single =
+    HostThreadTeamDataSingleton::singleton();
 
-  // This team member sets one byte within the sync variable
-  int8_t volatile * const sync_self =
-   ((int8_t *) sync) + m_team_rank ;
+  const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
+  // const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
 
 #if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
+fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
 fflush(stdout);
 #endif
 
-  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
 
-  while ( m_sync_value != *sync ); // wait for team to arrive
+#pragma omp parallel
+  {
+    Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to ensure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
 
-  ++m_sync_step ;
+    if ( self.organize_team( team_size ) ) {
 
-  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
-    m_sync_value ^= m_sync_mask ;
-    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
-  }
-}
+      Member single_exec( team_data_single );
+      Member team_exec( self );
 
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       );
+fflush(stdout);
 #endif
 
-//----------------------------------------------------------------------------
-
-void TaskQueueSpecialization< Kokkos::OpenMP >::execute
-  ( TaskQueue< Kokkos::OpenMP > * const queue )
-{
-  using execution_space = Kokkos::OpenMP ;
-  using queue_type      = TaskQueue< execution_space > ;
-  using task_root_type  = TaskBase< execution_space , void , void > ;
-  using PoolExec        = Kokkos::Impl::OpenMPexec ;
-  using Member          = TaskExec< execution_space > ;
+      // Loop until all queues are empty and no tasks in flight
 
-  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+      task_root_type * task = 0 ;
 
-  // Required:  team_size <= 8
+      do {
+        // Each team lead attempts to acquire either a thread team task
+        // or a single thread task for the team.
 
-  const int team_size = PoolExec::pool_size(2); // Threads per core
-  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+        if ( 0 == team_exec.team_rank() ) {
 
-  if ( 8 < team_size ) {
-    Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
-  }
+          bool leader_loop = false ;
 
-#pragma omp parallel
-  {
-    PoolExec & self = *PoolExec::get_thread_omp();
+          do {
 
-    Member single_exec ;
-    Member team_exec( self , team_size );
+            if ( 0 != task && end != task ) {
+              // team member #0 completes the previously executed task,
+              // completion may delete the task
+              queue->complete( task ); 
+            }
 
-    // Team shared memory
-    task_root_type * volatile * const task_shared =
-      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+            // If 0 == m_ready_count then set task = 0
 
-// Barrier across entire OpenMP thread pool to insure initialization
-#pragma omp barrier
+            task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
 
-    // Loop until all queues are empty and no tasks in flight
+            // Attempt to acquire a task
+            // Loop by priority and then type
+            for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+              for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+                task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+              }
+            }
 
-    do {
+            // If tasks are still executing
+            // and no task could be acquired
+            // then continue this leader loop
+            leader_loop = end == task ;
 
-      task_root_type * task = 0 ;
+            if ( ( ! leader_loop ) &&
+                 ( 0 != task ) &&
+                 ( task_root_type::TaskSingle == task->m_task_type ) ) {
 
-      // Each team lead attempts to acquire either a thread team task
-      // or a single thread task for the team.
+              // If it is a single-thread task then execute it now
 
-      if ( 0 == team_exec.team_rank() ) {
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , int64_t(task)
+       );
+fflush(stdout);
+#endif
 
-        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+              (*task->m_apply)( task , & single_exec );
 
-        // Loop by priority and then type
-        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
-          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-            task = queue_type::pop_task( & queue->m_ready[i][j] );
-          }
+              leader_loop = true ;
+            }
+          } while ( leader_loop );
         }
-      }
-
-      // Team lead broadcast acquired task to team members:
-
-      if ( 1 < team_exec.team_size() ) {
-
-        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
-
-        // Fence to be sure task_shared is stored before the barrier
-        Kokkos::memory_fence();
 
-        // Whole team waits for every team member to reach this statement
-        team_exec.team_barrier();
+        // Team lead either found 0 == m_ready_count or acquired a team task.
+        // Team lead broadcasts the acquired task:
 
-        // Fence to be sure task_shared is stored
-        Kokkos::memory_fence();
+        team_exec.team_broadcast( task , 0);
 
-        task = *task_shared ;
-      }
+        if ( 0 != task ) { // Thread Team Task
 
 #if 0
-fprintf( stdout
-       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
-       , team_exec.m_group_rank
-       , team_exec.m_team_rank
-       , uintptr_t(task_shared)
-       , uintptr_t(task)
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       , int64_t(task)
        );
 fflush(stdout);
 #endif
 
-      if ( 0 == task ) break ; // 0 == m_ready_count
-
-      if ( end == task ) {
-        // All team members wait for whole team to reach this statement.
-        // Is necessary to prevent task_shared from being updated
-        // before it is read by all threads.
-        team_exec.team_barrier();
-      }
-      else if ( task_root_type::TaskTeam == task->m_task_type ) {
-        // Thread Team Task
-        (*task->m_apply)( task , & team_exec );
+          (*task->m_apply)( task , & team_exec );
 
-        // The m_apply function performs a barrier
-
-        if ( 0 == team_exec.team_rank() ) {
-          // team member #0 completes the task, which may delete the task
-          queue->complete( task ); 
+          // The m_apply function performs a barrier
         }
-      }
-      else {
-        // Single Thread Task
+      } while( 0 != task );
 
-        if ( 0 == team_exec.team_rank() ) {
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       );
+fflush(stdout);
+#endif
 
-          (*task->m_apply)( task , & single_exec );
+    }
 
-          queue->complete( task ); 
-        }
+    self.disband_team();
+
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
+       , self.pool_rank()
+       , self.pool_size()
+       );
+fflush(stdout);
+#endif
 
-        // All team members wait for whole team to reach this statement.
-        // Not necessary to complete the task.
-        // Is necessary to prevent task_shared from being updated
-        // before it is read by all threads.
-        team_exec.team_barrier();
-      }
-    } while(1);
   }
 // END #pragma omp parallel
 
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
+fflush(stdout);
+#endif
+
 }
 
 void TaskQueueSpecialization< Kokkos::OpenMP >::
@@ -289,13 +273,16 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
   using execution_space = Kokkos::OpenMP ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   if ( 1 == omp_get_num_threads() ) {
 
     task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-    Member single_exec ;
+    HostThreadTeamData & team_data_single =
+      HostThreadTeamDataSingleton::singleton();
+
+    Member single_exec( team_data_single );
 
     task_root_type * task = end ;
 
@@ -306,7 +293,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
       // Loop by priority and then type
       for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
         for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-          task = queue_type::pop_task( & queue->m_ready[i][j] );
+          task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index 15dbb77c26c7432497417b0b27508b00d3d717af..3cfdf790bfb75165b936ce547828fd7f248f0b00 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -60,6 +60,7 @@ public:
   using execution_space = Kokkos::OpenMP ;
   using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
   using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
 
   // Must specify memory space
   using memory_space = Kokkos::HostSpace ;
@@ -70,296 +71,19 @@ public:
   // Must provide task queue execution function
   static void execute( queue_type * const );
 
-  // Must provide mechanism to set function pointer in
-  // execution space from the host process.
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( task_base_type::function_type * ptr )
-    {
-      using TaskType = TaskBase< Kokkos::OpenMP
-                               , typename FunctorType::value_type
-                               , FunctorType
-                               > ;
-       *ptr = TaskType::apply ;
-    }
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
 };
 
 extern template class TaskQueue< Kokkos::OpenMP > ;
 
-//----------------------------------------------------------------------------
-
-template<>
-class TaskExec< Kokkos::OpenMP >
-{
-private:
-
-  TaskExec( TaskExec && ) = delete ;
-  TaskExec( TaskExec const & ) = delete ;
-  TaskExec & operator = ( TaskExec && ) = delete ;
-  TaskExec & operator = ( TaskExec const & ) = delete ;
-
-
-  using PoolExec = Kokkos::Impl::OpenMPexec ;
-
-  friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
-  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
-
-  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
-  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
-  int64_t          m_sync_mask ;
-  int64_t mutable  m_sync_value ;
-  int     mutable  m_sync_step ;
-  int              m_group_rank ; ///< Which "team" subset of thread pool
-  int              m_team_rank ;  ///< Which thread within a team
-  int              m_team_size ;
-
-  TaskExec();
-  TaskExec( PoolExec & arg_exec , int arg_team_size );
-
-  void team_barrier_impl() const ;
-
-public:
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  void * team_shared() const
-    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
-
-  int team_shared_size() const
-    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
-
-  /**\brief  Whole team enters this function call
-   *         before any teeam member returns from
-   *         this function call.
-   */
-  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
-#else
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
-  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
-#endif
-
-  KOKKOS_INLINE_FUNCTION
-  int team_rank() const { return m_team_rank ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int team_size() const { return m_team_size ; }
-};
-
 }} /* namespace Kokkos::Impl */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
-TeamThreadRange
-  ( Impl::TaskExec< Kokkos::OpenMP > & thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
-}
-
-template<typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::TaskExec< Kokkos::OpenMP > >
-TeamThreadRange
-  ( Impl:: TaskExec< Kokkos::OpenMP > & thread, const iType1 & begin, const iType2 & end )
-{
-  typedef typename std::common_type<iType1, iType2>::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::OpenMP > >(thread, begin, end);
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
-ThreadVectorRange
-  ( Impl::TaskExec< Kokkos::OpenMP > & thread
-  , const iType & count )
-{
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
-}
-
-/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team.
- * This functionality requires C++11 support.
-*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for
-  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
-  , const Lambda& lambda
-  )
-{
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i);
-  }
-}
-
-template<typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
-  , const Lambda& lambda
-  , ValueType& initialized_result)
-{
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i, result);
-  }
-
-  if ( 1 < loop_boundaries.thread.team_size() ) {
-
-    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-
-    loop_boundaries.thread.team_barrier();
-    shared[team_rank] = result;
-
-    loop_boundaries.thread.team_barrier();
-
-    // reduce across threads to thread 0
-    if (team_rank == 0) {
-      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
-        shared[0] += shared[i];
-      }
-    }
-
-    loop_boundaries.thread.team_barrier();
-
-    // broadcast result
-    initialized_result = shared[0];
-  }
-  else {
-    initialized_result = result ;
-  }
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i, result);
-  }
-
-  if ( 1 < loop_boundaries.thread.team_size() ) {
-    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-
-    loop_boundaries.thread.team_barrier();
-    shared[team_rank] = result;
-
-    loop_boundaries.thread.team_barrier();
-
-    // reduce across threads to thread 0
-    if (team_rank == 0) {
-      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
-        join(shared[0], shared[i]);
-      }
-    }
-
-    loop_boundaries.thread.team_barrier();
-
-    // broadcast result
-    initialized_result = shared[0];
-  }
-  else {
-    initialized_result = result ;
-  }
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-}
-
-template< typename ValueType, typename iType, class Lambda >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda)
-{
-  ValueType accum = 0 ;
-  ValueType val, local_total;
-  ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-  int team_size = loop_boundaries.thread.team_size();
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-
-  // Intra-member scan
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-
-  shared[team_rank] = accum;
-  loop_boundaries.thread.team_barrier();
-
-  // Member 0 do scan on accumulated totals
-  if (team_rank == 0) {
-    for( iType i = 1; i < team_size; i+=1) {
-      shared[i] += shared[i-1];
-    }
-    accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
-  }
-
-  loop_boundaries.thread.team_barrier();
-
-  // Inter-member scan adding in accumulated totals
-  if (team_rank != 0) { accum = shared[team_rank-1]; }
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda)
-{
-}
-
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
index 34cf581a4796feb2e8b3d8a3f57343148ac955d9..2d50c6e54886087deea707d0dbb155566ed51428 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@@ -86,7 +86,7 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
 int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
 
-OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
 void OpenMPexec::verify_is_process( const char * const label )
 {
@@ -113,67 +113,110 @@ void OpenMPexec::verify_initialized( const char * const label )
 
 }
 
-void OpenMPexec::clear_scratch()
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void OpenMPexec::clear_thread_data()
 {
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
+
+  const int old_alloc_bytes =
+    m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
+
+  Kokkos::HostSpace space ;
+
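+  // Each thread disbands and frees its own team data inside a parallel region.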
 #pragma omp parallel
   {
-    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
-    if ( m_pool[ rank_rev ] ) {
-      Record * const r = Record::get_record( m_pool[ rank_rev ] );
-      m_pool[ rank_rev ] = 0 ;
-      Record::decrement( r );
+    const int rank = m_map_rank[ omp_get_thread_num() ];
+
+    if ( 0 != m_pool[rank] ) {
+
+      m_pool[rank]->disband_pool();
+
+      space.deallocate( m_pool[rank] , old_alloc_bytes );
+
+      m_pool[rank] = 0 ;
     }
   }
 /* END #pragma omp parallel */
 }
 
-void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
+void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
 {
-  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
-  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
 
-  const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
-  const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
+  HostThreadTeamData * root = m_pool[0] ;
 
-  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
+  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
+  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
+  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
+  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
 
-  // Requesting allocation and old allocation is too small:
+  // Allocate if any of the old allocations is too small:
 
-  const bool allocate = ( old_reduce_size < reduce_size ) ||
-                        ( old_thread_size < thread_size );
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
 
   if ( allocate ) {
-    if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
-    if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
-  }
 
-  const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
-  const int    pool_size  = m_pool_topo[0] ;
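+    // Scratch never shrinks: carry forward the larger of the previous and
+    // the requested size for each category.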
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
 
-  if ( allocate ) {
+    const size_t alloc_bytes =
+      member_bytes +
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    const int pool_size = omp_get_max_threads();
 
-    clear_scratch();
+    Kokkos::HostSpace space ;
 
 #pragma omp parallel
     {
-      const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-      const int rank     = pool_size - ( rank_rev + 1 );
+      const int rank = m_map_rank[ omp_get_thread_num() ];
 
-      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+      if ( 0 != m_pool[rank] ) {
 
-      Record * const r = Record::allocate( Kokkos::HostSpace()
-                                         , "openmp_scratch"
-                                         , alloc_size );
+        m_pool[rank]->disband_pool();
 
-      Record::increment( r );
+        space.deallocate( m_pool[rank] , old_alloc_bytes );
+      }
+
+      void * const ptr = space.allocate( alloc_bytes );
 
-      m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
+      m_pool[ rank ] = new( ptr ) HostThreadTeamData();
 
-      new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
+      m_pool[ rank ]->
+        scratch_assign( ((char *)ptr) + member_bytes
+                      , alloc_bytes
+                      , pool_reduce_bytes
+                      , team_reduce_bytes
+                      , team_shared_bytes
+                      , thread_local_bytes );
     }
 /* END #pragma omp parallel */
+
+    HostThreadTeamData::organize_pool( m_pool , pool_size );
   }
 }
 
@@ -197,14 +240,14 @@ void OpenMP::initialize( unsigned thread_count ,
   // Before any other call to OMP query the maximum number of threads
   // and save the value for re-initialization unit testing.
 
-  //Using omp_get_max_threads(); is problematic in conjunction with
-  //Hwloc on Intel (essentially an initial call to the OpenMP runtime
-  //without a parallel region before will set a process mask for a single core
-  //The runtime will than bind threads for a parallel region to other cores on the
-  //entering the first parallel region and make the process mask the aggregate of
-  //the thread masks. The intend seems to be to make serial code run fast, if you
-  //compile with OpenMP enabled but don't actually use parallel regions or so
-  //static int omp_max_threads = omp_get_max_threads();
+  // Using omp_get_max_threads() is problematic in conjunction with
+  // Hwloc on Intel: an initial call into the OpenMP runtime without a
+  // preceding parallel region sets a process mask for a single core.
+  // On entering the first parallel region the runtime then binds threads
+  // to other cores and makes the process mask the aggregate of the
+  // thread masks. The intent seems to be to make serial code run fast
+  // when compiling with OpenMP enabled but not actually using parallel regions.
+  // static int omp_max_threads = omp_get_max_threads();
   int nthreads = 0;
   #pragma omp parallel
   {
@@ -268,8 +311,6 @@ void OpenMP::initialize( unsigned thread_count ,
         // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
         // Call to 'new' may not be thread safe as well.
 
-        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
-
         const unsigned omp_rank    = omp_get_thread_num();
         const unsigned thread_r    = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
                                    ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
@@ -286,7 +327,19 @@ void OpenMP::initialize( unsigned thread_count ,
       Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
       Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
 
-      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
+      // New, unified host thread team data:
+      {
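+        // Initial scratch sizes; subsequent parallel dispatches grow them
+        // through resize_thread_data(), which never shrinks an allocation.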
+        size_t pool_reduce_bytes  =   32 * thread_count ;
+        size_t team_reduce_bytes  =   32 * thread_count ;
+        size_t team_shared_bytes  = 1024 * thread_count ;
+        size_t thread_local_bytes = 1024 ;
+
+        Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                            , team_reduce_bytes
+                                            , team_shared_bytes
+                                            , thread_local_bytes
+                                            );
+      }
     }
   }
 
@@ -309,7 +362,7 @@ void OpenMP::initialize( unsigned thread_count ,
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -321,7 +374,8 @@ void OpenMP::finalize()
   Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
   Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
 
-  Impl::OpenMPexec::clear_scratch();
+  // New, unified host thread team data:
+  Impl::OpenMPexec::clear_thread_data();
 
   Impl::OpenMPexec::m_pool_topo[0] = 0 ;
   Impl::OpenMPexec::m_pool_topo[1] = 0 ;
@@ -333,7 +387,7 @@ void OpenMP::finalize()
     hwloc::unbind_this_thread();
   }
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
index 63f7234da3a81a5e040f76e264377156cf024bb0..39ace3131927d8071c50fc44dedb046bf598f0de 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@@ -44,13 +44,22 @@
 #ifndef KOKKOS_OPENMPEXEC_HPP
 #define KOKKOS_OPENMPEXEC_HPP
 
+#include <Kokkos_OpenMP.hpp>
+
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
+
 #include <iostream>
 #include <sstream>
 #include <fstream>
+
+#include <omp.h>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
 namespace Kokkos {
 namespace Impl {
 
@@ -60,41 +69,19 @@ namespace Impl {
 class OpenMPexec {
 public:
 
+  friend class Kokkos::OpenMP ;
+
   enum { MAX_THREAD_COUNT = 4096 };
 
 private:
 
-  static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
-
   static int          m_pool_topo[ 4 ];
   static int          m_map_rank[ MAX_THREAD_COUNT ];
 
-  friend class Kokkos::OpenMP ;
-
-  int const  m_pool_rank ;
-  int const  m_pool_rank_rev ;
-  int const  m_scratch_exec_end ;
-  int const  m_scratch_reduce_end ;
-  int const  m_scratch_thread_end ;
-
-  int volatile  m_barrier_state ;
-
-  // Members for dynamic scheduling
-  // Which thread am I stealing from currently
-  int m_current_steal_target;
-  // This thread's owned work_range
-  Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
-  // Team Offset if one thread determines work_range for others
-  long m_team_work_index;
+  static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
 
-  // Is this thread stealing (i.e. its owned work_range is exhausted
-  bool m_stealing;
-
-  OpenMPexec();
-  OpenMPexec( const OpenMPexec & );
-  OpenMPexec & operator = ( const OpenMPexec & );
-
-  static void clear_scratch();
+  static
+  void clear_thread_data();
 
 public:
 
@@ -108,47 +95,9 @@ public:
   inline static
   int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
 
-  inline static
-  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
-
-  inline int pool_rank() const { return m_pool_rank ; }
-  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
-
-  inline long team_work_index() const { return m_team_work_index ; }
-
-  inline int scratch_reduce_size() const
-    { return m_scratch_reduce_end - m_scratch_exec_end ; }
-
-  inline int scratch_thread_size() const
-    { return m_scratch_thread_end - m_scratch_reduce_end ; }
-
-  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
-  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
-
-  inline
-  void state_wait( int state )
-    { Impl::spinwait( m_barrier_state , state ); }
-
-  inline
-  void state_set( int state ) { m_barrier_state = state ; }
-
-  ~OpenMPexec() {}
-
-  OpenMPexec( const int arg_poolRank
-            , const int arg_scratch_exec_size
-            , const int arg_scratch_reduce_size
-            , const int arg_scratch_thread_size )
-    : m_pool_rank( arg_poolRank )
-    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
-    , m_scratch_exec_end( arg_scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end   + arg_scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
-    , m_barrier_state(0)
-    {}
-
   static void finalize();
 
-  static void initialize( const unsigned  team_count ,
+  static void initialize( const unsigned team_count ,
                           const unsigned threads_per_team ,
                           const unsigned numa_count ,
                           const unsigned cores_per_numa );
@@ -156,133 +105,20 @@ public:
   static void verify_is_process( const char * const );
   static void verify_initialized( const char * const );
 
-  static void resize_scratch( size_t reduce_size , size_t thread_size );
 
-  inline static
-  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
+  static
+  void resize_thread_data( size_t pool_reduce_bytes
+                         , size_t team_reduce_bytes
+                         , size_t team_shared_bytes
+                         , size_t thread_local_bytes );
 
-  /* Dynamic Scheduling related functionality */
-  // Initialize the work range for this thread
-  inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
-    m_work_range.first = (begin+chunk_size-1)/chunk_size;
-    m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
-  }
-
-  // Claim an index from this thread's range, from the beginning
-  inline long get_work_index_begin () {
-    Kokkos::pair<long,long> work_range_new = m_work_range;
-    Kokkos::pair<long,long> work_range_old = work_range_new;
-    if(work_range_old.first>=work_range_old.second)
-      return -1;
-
-    work_range_new.first+=1;
-
-    bool success = false;
-    while(!success) {
-      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
-      success = ( (work_range_new == work_range_old) ||
-                  (work_range_new.first>=work_range_new.second));
-      work_range_old = work_range_new;
-      work_range_new.first+=1;
-    }
-    if(work_range_old.first<work_range_old.second)
-      return work_range_old.first;
-    else
-      return -1;
-  }
-
-  // Claim an index from this thread's range, from the end
-  inline long get_work_index_end () {
-    Kokkos::pair<long,long> work_range_new = m_work_range;
-    Kokkos::pair<long,long> work_range_old = work_range_new;
-    if(work_range_old.first>=work_range_old.second)
-      return -1;
-    work_range_new.second-=1;
-    bool success = false;
-    while(!success) {
-      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
-      success = ( (work_range_new == work_range_old) ||
-                  (work_range_new.first>=work_range_new.second) );
-      work_range_old = work_range_new;
-      work_range_new.second-=1;
-    }
-    if(work_range_old.first<work_range_old.second)
-      return work_range_old.second-1;
-    else
-      return -1;
-  }
-
-  // Reset the steal target
-  inline void reset_steal_target() {
-    m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
-    m_stealing = false;
-  }
-
-  // Reset the steal target
-  inline void reset_steal_target(int team_size) {
-    m_current_steal_target = (m_pool_rank_rev+team_size);
-    if(m_current_steal_target>=m_pool_topo[0])
-      m_current_steal_target = 0;//m_pool_topo[0]-1;
-    m_stealing = false;
-  }
-
-  // Get a steal target; start with my rank + 1 and go round-robin until arriving back at this thread's rank.
-  // Returns -1 if no active steal target is available.
-  inline int get_steal_target() {
-    while(( m_pool[m_current_steal_target]->m_work_range.second <=
-            m_pool[m_current_steal_target]->m_work_range.first  ) &&
-          (m_current_steal_target!=m_pool_rank) ) {
-      m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
-    }
-    if(m_current_steal_target == m_pool_rank)
-      return -1;
-    else
-      return m_current_steal_target;
-  }
-
-  inline int get_steal_target(int team_size) {
-
-    while(( m_pool[m_current_steal_target]->m_work_range.second <=
-            m_pool[m_current_steal_target]->m_work_range.first  ) &&
-          (m_current_steal_target!=m_pool_rank_rev) ) {
-      if(m_current_steal_target + team_size < m_pool_topo[0])
-        m_current_steal_target = (m_current_steal_target+team_size);
-      else
-        m_current_steal_target = 0;
-    }
-
-    if(m_current_steal_target == m_pool_rank_rev)
-      return -1;
-    else
-      return m_current_steal_target;
-  }
-
-  inline long steal_work_index (int team_size = 0) {
-    long index = -1;
-    int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
-    while ( (steal_target != -1) && (index == -1)) {
-      index = m_pool[steal_target]->get_work_index_end();
-      if(index == -1)
-        steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
-    }
-    return index;
-  }
-
-  // Get a work index. Claim from the owned range until it is exhausted, then steal from another thread.
-  inline long get_work_index (int team_size = 0) {
-    long work_index = -1;
-    if(!m_stealing) work_index = get_work_index_begin();
-
-    if( work_index == -1) {
-      memory_fence();
-      m_stealing = true;
-      work_index = steal_work_index(team_size);
-    }
-    m_team_work_index = work_index;
-    memory_fence();
-    return work_index;
-  }
+  inline static
+  HostThreadTeamData * get_thread_data() noexcept
+    { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
 
+  inline static
+  HostThreadTeamData * get_thread_data( int i ) noexcept
+    { return m_pool[i]; }
 };
 
 } // namespace Impl
@@ -294,356 +130,6 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-class OpenMPexecTeamMember {
-public:
-
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  /** \brief  Thread states for team synchronization */
-  enum { Active = 0 , Rendezvous = 1 };
-
-  typedef Kokkos::OpenMP                         execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-  Impl::OpenMPexec    & m_exec ;
-  scratch_memory_space  m_team_shared ;
-  int                   m_team_scratch_size[2] ;
-  int                   m_team_base_rev ;
-  int                   m_team_rank_rev ;
-  int                   m_team_rank ;
-  int                   m_team_size ;
-  int                   m_league_rank ;
-  int                   m_league_end ;
-  int                   m_league_size ;
-
-  int                   m_chunk_size;
-  int                   m_league_chunk_end;
-  Impl::OpenMPexec    & m_team_lead_exec ;
-  int                   m_invalid_thread;
-  int                   m_team_alloc;
-
-  // Fan-in team threads; the root of the fan-in, which does not block, returns true
-  inline
-  bool team_fan_in() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        memory_fence();
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  inline
-  void team_fan_out() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
-        memory_fence();
-      }
-    }
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_shmem() const
-    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    {}
-#else
-    {
-      if ( 1 < m_team_size && !m_invalid_thread) {
-        team_fan_in();
-        team_fan_out();
-      }
-    }
-#endif
-
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(ValueType& value, const int& thread_id) const
-  {
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { }
-#else
-    // Make sure there is enough scratch space:
-    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
-                         , ValueType , void >::type type ;
-
-    type volatile * const shared_value =
-      ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-
-    if ( team_rank() == thread_id ) *shared_value = value;
-    memory_fence();
-    team_barrier(); // Wait for 'thread_id' to write
-    value = *shared_value ;
-    team_barrier(); // Wait for team members to read
-#endif
-  }
-
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION ValueType
-    team_reduce( const ValueType & value
-               , const JoinOp & op_in ) const
-  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ValueType(); }
-  #else
-    {
-      memory_fence();
-      typedef ValueType value_type;
-      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
-  #endif
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
-                           , value_type , void >::type type ;
-
-      type * const local_value = ((type*) m_exec.scratch_thread());
-
-      // Set this thread's contribution
-      *local_value = value ;
-
-      // Fence to make sure the base team member has access:
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        type * const team_value  = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-
-        // Join to the team value:
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
-        }
-        memory_fence();
-
-        // The base team member may "lap" the other team members,
-        // copy to their local value before proceeding.
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
-        }
-
-        // Fence to make sure all team members have access
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *((type volatile const *)local_value);
-    }
-#endif
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          and inter-team non-deterministic ordering of accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename ArgType >
-  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ArgType(); }
-#else
-    {
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
-
-      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
-
-      *work_value = value ;
-
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        // m_team_base[0]                 == highest ranking team member
-        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
-        //
-        // 1) copy from lower to higher rank, initialize lowest rank to zero
-        // 2) prefix sum from lowest to highest rank, skipping lowest rank
-
-        type accum = 0 ;
-
-        if ( global_accum ) {
-          for ( int i = m_team_size ; i-- ; ) {
-            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-            accum += val ;
-          }
-          accum = atomic_fetch_add( global_accum , accum );
-        }
-
-        for ( int i = m_team_size ; i-- ; ) {
-          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-          const type offset = accum ;
-          accum += val ;
-          val = offset ;
-        }
-
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *work_value ;
-    }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-    { return this-> template team_scan<Type>( value , 0 ); }
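  // A minimal usage sketch of the team_scan interface documented above, assuming a
  // hypothetical team member handle `team` and per-thread contribution `my_count`:
  //
  //   int offset = team.team_scan( my_count );  // sum of contributions of lower-ranked threads
  //   int total  = offset + my_count;           // on the highest-ranked thread: the team total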
-
-  //----------------------------------------
-  // Private for the driver
-
-private:
-
-  typedef execution_space::scratch_memory_space space ;
-
-public:
-
-  template< class ... Properties >
-  inline
-  OpenMPexecTeamMember( Impl::OpenMPexec & exec
-                      , const TeamPolicyInternal< OpenMP, Properties ...> & team
-                      , const int shmem_size_L1
-                      , const int shmem_size_L2
-                      )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
-    , m_team_base_rev(0)
-    , m_team_rank_rev(0)
-    , m_team_rank(0)
-    , m_team_size( team.team_size() )
-    , m_league_rank(0)
-    , m_league_end(0)
-    , m_league_size( team.league_size() )
-    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
-    , m_league_chunk_end(0)
-    , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
-    , m_team_alloc( team.team_alloc())
-    {
-      const int pool_rank_rev        = m_exec.pool_rank_rev();
-      const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
-      const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
-      const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
-      const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
-            int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
-            int league_iter_begin    = league_iter_end - chunks_per_team * m_chunk_size;
-      if (league_iter_begin < 0)     league_iter_begin = 0;
-      if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
-
-      if ((team.team_alloc()>m_team_size)?
-          (pool_team_rank_rev >= m_team_size):
-          (m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
-         )
-        m_invalid_thread = 1;
-      else
-        m_invalid_thread = 0;
-
-      m_team_rank_rev  = pool_team_rank_rev ;
-      if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
-        m_team_base_rev  = team.team_alloc() * pool_league_rank_rev ;
-        m_team_rank_rev  = pool_team_rank_rev ;
-        m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
-        m_league_end     = league_iter_end ;
-        m_league_rank    = league_iter_begin ;
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                               0 );
-      }
-
-      if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
-        m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
-        m_exec.reset_steal_target(m_team_size);
-      }
-    }
-
-  bool valid_static() const
-    {
-      return m_league_rank < m_league_end ;
-    }
-
-  void next_static()
-    {
-      if ( m_league_rank < m_league_end ) {
-        team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                               0);
-      }
-      m_league_rank++;
-    }
-
-  bool valid_dynamic() {
-    if(m_invalid_thread)
-      return false;
-    if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
-      return true;
-    }
-
-    if (  m_team_rank_rev == 0 ) {
-      m_team_lead_exec.get_work_index(m_team_alloc);
-    }
-    team_barrier();
-
-    long work_index = m_team_lead_exec.team_work_index();
-
-    m_league_rank = work_index * m_chunk_size;
-    m_league_chunk_end = (work_index +1 ) * m_chunk_size;
-
-    if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
-
-    if(m_league_rank>=0)
-      return true;
-    return false;
-  }
-
-  void next_dynamic() {
-    if(m_invalid_thread)
-      return;
-
-    if ( m_league_rank < m_league_chunk_end ) {
-      team_barrier();
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                             0);
-    }
-    m_league_rank++;
-  }
-
-  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
-};
-
 template< class ... Properties >
 class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
 {
@@ -671,8 +157,11 @@ public:
 
   template< class FunctorType >
   inline static
-  int team_size_max( const FunctorType & )
-    { return traits::execution_space::thread_pool_size(1); }
+  int team_size_max( const FunctorType & ) {
+      int pool_size = traits::execution_space::thread_pool_size(1);
+      int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      return pool_size<max_host_team_size?pool_size:max_host_team_size;
+    }
 
   template< class FunctorType >
   inline static
@@ -702,7 +191,8 @@ private:
                   , const int team_size_request )
     {
       const int pool_size  = traits::execution_space::thread_pool_size(0);
-      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size<max_host_team_size?pool_size:max_host_team_size;
       const int team_grain = traits::execution_space::thread_pool_size(2);
 
       m_league_size = league_size_request ;
@@ -823,7 +313,7 @@ private:
   }
 
 public:
-  typedef Impl::OpenMPexecTeamMember member_type ;
+  typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
 };
 } // namespace Impl
 
@@ -850,216 +340,6 @@ int OpenMP::thread_pool_rank()
 #endif
 }
 
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >
-TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType& count ) {
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, count );
-}
-
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::OpenMPexecTeamMember >
-TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType1& begin, const iType2& end ) {
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, iType(begin), iType(end) );
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
-ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
-}
-
 } // namespace Kokkos
 
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  result = ValueType();
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-
-  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
-}
-
-/** \brief  Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0' for '+' or
- * '1' for '*'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(result,join);
-}
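// A minimal sketch of the join-based team reduction documented above, assuming a
// hypothetical team member `team`, extent `n`, and View `x`; the initial value is
// the neutral element of the join operation (here a maximum):
//
//   double team_max = -DBL_MAX;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, n ),
//     [&]( const int i, double & val ) { if ( val < x(i) ) val = x(i); },
//     []( double & dst, const double & src ) { if ( dst < src ) dst = src; },
//     team_max );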
-
-} //namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-  #pragma ivdep
-  #endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0' for '+' or
- * '1' for '*'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
- * Depending on the target execution space the operator might be called twice: once with final=false
- * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
- * "i" needs to be added to val no matter whether final==true or not. In a serial execution
- * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
- * to the final sum value over all vector lanes.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
-  typedef typename ValueTraits::value_type value_type ;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,scan_val,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  if(single_struct.team_member.team_rank()==0) lambda();
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  if(single_struct.team_member.team_rank()==0) {
-    lambda(val);
-  }
-  single_struct.team_member.team_broadcast(val,0);
-}
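// A minimal sketch of the PerTeam single-with-broadcast pattern defined above,
// assuming a hypothetical team member `team` and helper `compute_root_value()`:
// the lambda executes on team rank 0 only and the result is broadcast to all members.
//
//   double shared_value = 0;
//   Kokkos::single( Kokkos::PerTeam( team ),
//     [&]( double & v ) { v = compute_root_value(); },
//     shared_value );  // every team member now holds the same shared_value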
-}
-
 #endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
deleted file mode 100644
index b4df5e35bb7897b7e7bdf76acb4f2bc4d9a9fe77..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_ENABLE_QTHREAD )
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <utility>
-#include <Kokkos_Qthread.hpp>
-#include <Kokkos_Atomic.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-// Defines to enable experimental Qthread functionality
-
-#define QTHREAD_LOCAL_PRIORITY
-#define CLONED_TASKS
-
-#include <qthread/qthread.h>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
-
-/** s_exec is indexed by the reverse rank of the workers
- *  for faster fan-in / fan-out lookups
- *  [ n - 1 , n - 2 , ... , 0 ]
- */
-QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
-
-int  s_number_shepherds            = 0 ;
-int  s_number_workers_per_shepherd = 0 ;
-int  s_number_workers              = 0 ;
-
-inline
-QthreadExec ** worker_exec()
-{
-  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
-}
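// A small worked example of the reverse-rank indexing above (hypothetical sizes):
// with 2 shepherds and 4 workers per shepherd, s_number_workers == 8, so the worker
// on shepherd 1 with local rank 2 maps to s_exec[ 8 - (1*4 + 2 + 1) ] == s_exec[1];
// higher-ranked workers therefore occupy lower slots, matching the layout noted for s_exec.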
-
-const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
-
-int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
-int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
-int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
-
-QthreadExecFunctionPointer volatile s_active_function = 0 ;
-const void               * volatile s_active_function_arg = 0 ;
-
-} /* namespace */
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-int Qthread::is_initialized()
-{
-  return Impl::s_number_workers != 0 ;
-}
-
-int Qthread::concurrency()
-{
-  return Impl::s_number_workers_per_shepherd ;
-}
-
-int Qthread::in_parallel()
-{
-  return Impl::s_active_function != 0 ;
-}
-
-void Qthread::initialize( int thread_count )
-{
-  // Environment variable: QTHREAD_NUM_SHEPHERDS
-  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
-  // Environment variable: QTHREAD_HWPAR
-
-  {
-    char buffer[256];
-    snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
-    putenv(buffer);
-  }
-
-  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
-                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
-                       ( thread_count    == qthread_num_workers() );
-
-  bool ok_symmetry = true ;
-
-  if ( ok_init ) {
-    Impl::s_number_shepherds            = qthread_num_shepherds();
-    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
-    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
-
-    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
-      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
-    }
-  }
-
-  if ( ! ok_init || ! ok_symmetry ) {
-    std::ostringstream msg ;
-
-    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
-    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
-    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
-    msg << " : qthread_num_workers = " << qthread_num_workers();
-
-    if ( ! ok_symmetry ) {
-      msg << " : qthread_num_workers_local = {" ;
-      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
-        msg << " " << qthread_num_workers_local(i) ;
-      }
-      msg << " }" ;
-    }
-
-    Impl::s_number_workers   = 0 ;
-    Impl::s_number_shepherds = 0 ;
-    Impl::s_number_workers_per_shepherd = 0 ;
-
-    if ( ok_init ) { qthread_finalize(); }
-
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
-
-  // Init the array used for arbitrarily sized atomics.
-  Impl::init_lock_array_host_space();
-
-}
-
-void Qthread::finalize()
-{
-  Impl::QthreadExec::clear_workers();
-
-  if ( Impl::s_number_workers ) {
-    qthread_finalize();
-  }
-
-  Impl::s_number_workers    = 0 ;
-  Impl::s_number_shepherds  = 0 ;
-  Impl::s_number_workers_per_shepherd = 0 ;
-}
-
-void Qthread::print_configuration( std::ostream & s , const bool detail )
-{
-  s << "Kokkos::Qthread {"
-    << " num_shepherds(" << Impl::s_number_shepherds << ")"
-    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
-    << " }" << std::endl ;
-}
-
-Qthread & Qthread::instance( int )
-{
-  static Qthread q ;
-  return q ;
-}
-
-void Qthread::fence()
-{
-}
-
-int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
-int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-aligned_t driver_exec_all( void * arg )
-{
-  QthreadExec & exec = **worker_exec();
-
-  (*s_active_function)( exec , s_active_function_arg );
-
-/*
-  fprintf( stdout
-         , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
-         , exec.worker_rank()
-         , exec.worker_size()
-         , exec.shepherd_rank()
-         , exec.shepherd_size()
-         , exec.shepherd_worker_rank()
-         , exec.shepherd_worker_size()
-         );
-  fflush(stdout);
-*/
-
-  return 0 ;
-}
-
-aligned_t driver_resize_worker_scratch( void * arg )
-{
-  static volatile int lock_begin = 0 ;
-  static volatile int lock_end   = 0 ;
-
-  QthreadExec ** const exec = worker_exec();
-
-  //----------------------------------------
-  // Serialize allocation for thread safety
-
-  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
-
-  const bool ok = 0 == *exec ;
-
-  if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
-
-  lock_begin = 0 ; // release lock
-
-  if ( ok ) { new( *exec ) QthreadExec(); }
-
-  //----------------------------------------
-  // Wait for all calls to complete to ensure that each worker has executed.
-
-  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
-
-  while ( lock_end );
-
-/*
-  fprintf( stdout
-         , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
-         , (**exec).worker_rank()
-         , (**exec).worker_size()
-         , (**exec).shepherd_rank()
-         , (**exec).shepherd_size()
-         , (**exec).shepherd_worker_rank()
-         , (**exec).shepherd_worker_size()
-         );
-  fflush(stdout);
-*/
-
-  //----------------------------------------
-
-  if ( ! ok ) {
-    fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
-    fflush( stderr );
-  }
-
-  return 0 ;
-}
-
-void verify_is_process( const char * const label , bool not_active = false )
-{
-  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
-  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );
-
-  if ( not_process || is_active ) {
-    std::string msg( label );
-    msg.append( " : FAILED" );
-    if ( not_process ) msg.append(" : not called by main process");
-    if ( is_active )   msg.append(" : parallel execution in progress");
-    Kokkos::Impl::throw_runtime_exception( msg );
-  }
-}
-
-}
-
-int QthreadExec::worker_per_shepherd()
-{
-  return s_number_workers_per_shepherd ;
-}
-
-QthreadExec::QthreadExec()
-{
-  const int shepherd_rank        = qthread_shep();
-  const int shepherd_worker_rank = qthread_worker_local(NULL);
-  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
-
-  m_worker_base          = s_exec ;
-  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
-  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
-  m_reduce_end           = s_worker_reduce_end ;
-  m_shepherd_rank        = shepherd_rank ;
-  m_shepherd_size        = s_number_shepherds ;
-  m_shepherd_worker_rank = shepherd_worker_rank ;
-  m_shepherd_worker_size = s_number_workers_per_shepherd ;
-  m_worker_rank          = worker_rank ;
-  m_worker_size          = s_number_workers ;
-  m_worker_state         = QthreadExec::Active ;
-}
-
-void QthreadExec::clear_workers()
-{
-  for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
-    QthreadExec * const exec = s_exec[iwork] ;
-    s_exec[iwork] = 0 ;
-    free( exec );
-  }
-}
-
-void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
-{
-  new( & space )
-    Qthread::scratch_memory_space(
-      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
-      s_worker_shared_end - s_worker_shared_begin
-    );
-}
-
-void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
-{
-  const int exec_all_reduce_alloc = align_alloc( reduce_size );
-  const int shepherd_scan_alloc   = align_alloc( 8 );
-  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
-
-  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
-       s_worker_shared_end < shepherd_shared_end ) {
-
-/*
-  fprintf( stdout , "QthreadExec::resize\n");
-  fflush(stdout);
-*/
-
-    // Clear current worker memory before allocating new worker memory
-    clear_workers();
-
-    // Increase the buffers to an aligned allocation
-    s_worker_reduce_end   = exec_all_reduce_alloc ;
-    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
-    s_worker_shared_end   = shepherd_shared_end ;
-
-    // Need to query which shepherd this main 'process' is running on...
- 
-    const int main_shep = qthread_shep();
-
-    // Have each worker resize its memory for proper first-touch
-#if 0
-    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
-      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
-    }}
-#else
-    // If this function is used before the 'qthread.task_policy' unit test,
-    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
-    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
-
-      if ( num_clone ) {
-        const int ret = qthread_fork_clones_to_local_priority
-          ( driver_resize_worker_scratch   /* function */
-          , NULL                           /* function data block */
-          , NULL                           /* pointer to return value feb */
-          , jshep                          /* shepherd number */
-          , num_clone - 1                  /* number of instances - 1 */
-          );
-
-        assert(ret == QTHREAD_SUCCESS);
-      }
-    }
-#endif
-
-    driver_resize_worker_scratch( NULL );
-
-    // Verify all workers allocated
-
-    bool ok = true ;
-    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
-
-    if ( ! ok ) {
-      std::ostringstream msg ;
-      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
-      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
-         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
-      }
-      msg << " }" ;
-      Kokkos::Impl::throw_runtime_exception( msg.str() );
-    }
-  }
-}
-
-void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
-{
-  verify_is_process("QthreadExec::exec_all(...)",true);
-
-/*
-  fprintf( stdout , "QthreadExec::exec_all\n");
-  fflush(stdout);
-*/
-
-  s_active_function     = func ;
-  s_active_function_arg = arg ;
-
-  // Need to query which shepherd this main 'process' is running on...
- 
-  const int main_shep = qthread_shep();
-
-#if 0
-  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
-    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
-  }}
-#else
-  // If this function is used before the 'qthread.task_policy' unit test,
-  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
-  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
-
-    if ( num_clone ) {
-      const int ret = qthread_fork_clones_to_local_priority
-        ( driver_exec_all   /* function */
-        , NULL              /* function data block */
-        , NULL              /* pointer to return value feb */
-        , jshep             /* shepherd number */
-        , num_clone - 1     /* number of instances - 1 */
-        );
-
-      assert(ret == QTHREAD_SUCCESS);
-    }
-  }
-#endif
-
-  driver_exec_all( NULL );
-
-  s_active_function     = 0 ;
-  s_active_function_arg = 0 ;
-}
-
-void * QthreadExec::exec_all_reduce_result()
-{
-  return s_exec[0]->m_scratch_alloc ;
-}
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-namespace Impl {
-
-QthreadTeamPolicyMember::QthreadTeamPolicyMember()
-  : m_exec( **worker_exec() )
-  , m_team_shared(0,0)
-  , m_team_size( 1 )
-  , m_team_rank( 0 )
-  , m_league_size(1)
-  , m_league_end(1)
-  , m_league_rank(0)
-{
-  m_exec.shared_reset( m_team_shared );
-}
-
-QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
-  : m_exec( **worker_exec() )
-  , m_team_shared(0,0)
-  , m_team_size( s_number_workers_per_shepherd )
-  , m_team_rank( m_exec.shepherd_worker_rank() )
-  , m_league_size(1)
-  , m_league_end(1)
-  , m_league_rank(0)
-{
-  m_exec.shared_reset( m_team_shared );
-}
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
-
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
deleted file mode 100644
index f948eb2903b631e82727e670e84339383d5891c9..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_QTHREADEXEC_HPP
-#define KOKKOS_QTHREADEXEC_HPP
-
-#include <impl/Kokkos_spinwait.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-class QthreadExec ;
-
-typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
-
-class QthreadExec {
-private:
-
-  enum { Inactive = 0 , Active = 1 };
-
-  const QthreadExec * const * m_worker_base ;
-  const QthreadExec * const * m_shepherd_base ;
-
-  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
-  int     m_reduce_end ;     ///< End of scratch reduction memory
-
-  int     m_shepherd_rank ;
-  int     m_shepherd_size ;
-
-  int     m_shepherd_worker_rank ;
-  int     m_shepherd_worker_size ;
-
-  /*
-   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
-   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
-   */
-  int     m_worker_rank ;
-  int     m_worker_size ;
-
-  int mutable volatile m_worker_state ;
-
-
-  friend class Kokkos::Qthread ;
-
-  ~QthreadExec();
-  QthreadExec( const QthreadExec & );
-  QthreadExec & operator = ( const QthreadExec & );
-
-public:
-
-  QthreadExec();
-
-  /** Execute the input function on all available Qthread workers */
-  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
-
-  //----------------------------------------
-  /** Barrier across all workers participating in the 'exec_all' */
-  void exec_all_barrier() const
-    {
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  /** Barrier across workers within the shepherd with rank < team_size */
-  void shepherd_barrier( const int team_size ) const
-    {
-      if ( m_shepherd_worker_rank < team_size ) {
-
-        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-        int n , j ;
-
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-        }
-
-        if ( rev_rank ) {
-          m_worker_state = QthreadExec::Inactive ;
-          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-        }
-
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-        }
-      }
-    }
-
-  //----------------------------------------
-  /** Reduce across all workers participating in the 'exec_all' */
-  template< class FunctorType , class ReducerType , class ArgTag >
-  inline
-  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
-    {
-      typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
-      typedef typename ReducerConditional::type ReducerTypeFwd;
-      typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        const QthreadExec & fan = *m_worker_base[j];
-
-        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
-
-        ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-  /** Scan across all workers participating in the 'exec_all' */
-  template< class FunctorType , class ArgTag >
-  inline
-  void exec_all_scan( const FunctorType & func ) const
-    {
-      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > ValueInit ;
-      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > ValueJoin ;
-      typedef Kokkos::Impl::FunctorValueOps<    FunctorType , ArgTag > ValueOps ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_worker_base[0] is the
-        // highest ranking thread.
-
-        // Copy from lower ranking to higher ranking worker.
-        for ( int i = 1 ; i < m_worker_size ; ++i ) {
-          ValueOps::copy( func
-                        , m_worker_base[i-1]->m_scratch_alloc
-                        , m_worker_base[i]->m_scratch_alloc
-                        );
-        }
-
-        ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
-
-        // Join from lower ranking to higher ranking worker.
-        // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
-        for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
-          ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
-        }
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-
-  template< class Type>
-  inline
-  volatile Type * shepherd_team_scratch_value() const
-    { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
-
-  template< class Type >
-  inline
-  void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
-    {
-      if ( m_shepherd_base ) {
-        Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
-        memory_fence();
-        shepherd_barrier( team_size );
-        value = *shared_value ;
-      }
-    }
-
-  template< class Type >
-  inline
-  Type shepherd_reduce( const int team_size , const Type & value ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < n ; ++i ) {
-          accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-        for ( int i = 1 ; i < n ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  template< class JoinOp >
-  inline
-  typename JoinOp::value_type
-    shepherd_reduce( const int team_size
-                   , const typename JoinOp::value_type & value
-                   , const JoinOp & op ) const
-    {
-      typedef typename JoinOp::value_type Type ;
-
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
-        }
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  template< class Type >
-  inline
-  Type shepherd_scan( const int team_size
-                    , const Type & value
-                    ,       Type * const global_value = 0 ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_shepherd_base[0] is the
-        // highest ranking thread.
-
-        // Copy from lower ranking to higher ranking worker.
-
-        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-          accum += tmp ;
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
-        }
-
-        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
-          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
-
-        // Join from lower ranking to higher ranking worker.
-        for ( int i = team_size ; --i ; ) {
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  //----------------------------------------
-
-  static inline
-  int align_alloc( int size )
-    {
-      enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */};
-      enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
-      return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
-    }
-
-  void shared_reset( Qthread::scratch_memory_space & );
-
-  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
-
-  static void * exec_all_reduce_result();
-
-  static void resize_worker_scratch( const int reduce_size , const int shared_size );
-  static void clear_workers();
-
-  //----------------------------------------
-
-  inline int worker_rank() const { return m_worker_rank ; }
-  inline int worker_size() const { return m_worker_size ; }
-  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
-  inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
-  inline int shepherd_rank() const { return m_shepherd_rank ; }
-  inline int shepherd_size() const { return m_shepherd_size ; }
-
-  static int worker_per_shepherd();
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-class QthreadTeamPolicyMember {
-private:
-
-  typedef Kokkos::Qthread                        execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-
-        Impl::QthreadExec   & m_exec ;
-  scratch_memory_space        m_team_shared ;
-  const int                   m_team_size ;
-  const int                   m_team_rank ;
-  const int                   m_league_size ;
-  const int                   m_league_end ;
-        int                   m_league_rank ;
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const { return m_team_shared ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    {}
-#else
-    { m_exec.shepherd_barrier( m_team_size ); }
-#endif
-
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
-#endif
-
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
-#endif
-
-  template< typename JoinOp >
-  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
-    team_reduce( const typename JoinOp::value_type & value
-               , const JoinOp & op ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return typename JoinOp::value_type(); }
-#else
-    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
-#endif
-
-  //----------------------------------------
-  // Private driver for task-team parallel
-
-  struct TaskTeam {};
-
-  QthreadTeamPolicyMember();
-  explicit QthreadTeamPolicyMember( const TaskTeam & );
-
-  //----------------------------------------
-  // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }
-
-  // Initialize
-  template< class ... Properties >
-  QthreadTeamPolicyMember( Impl::QthreadExec & exec
-                         , const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_size(   team.m_team_size )
-    , m_team_rank(   exec.shepherd_worker_rank() )
-    , m_league_size( team.m_league_size )
-    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
-    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
-  {
-    m_exec.shared_reset( m_team_shared );
-  }
-
-  // Continue
-  operator bool () const { return m_league_rank < m_league_end ; }
-
-  // iterate
-  void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
-};
-
-
-template< class ... Properties >
-class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
-  : public PolicyTraits< Properties... >
-{
-private:
-
-  const int m_league_size ;
-  const int m_team_size ;
-  const int m_shepherd_iter ;
-
-public:
-
-  //! Tag this class as a kokkos execution policy
-  typedef TeamPolicyInternal  execution_policy ;
-  typedef Qthread             execution_space ;
-  typedef PolicyTraits< Properties ... >  traits ;
-
-  //----------------------------------------
-
-  template< class FunctorType >
-  inline static
-  int team_size_max( const FunctorType & )
-    { return Qthread::instance().shepherd_worker_size(); }
-
-  template< class FunctorType >
-  static int team_size_recommended( const FunctorType & f )
-    { return team_size_max( f ); }
-
-  template< class FunctorType >
-  inline static
-  int team_size_recommended( const FunctorType & f , const int& )
-    { return team_size_max( f ); }
-
-  //----------------------------------------
-
-  inline int team_size()   const { return m_team_size ; }
-  inline int league_size() const { return m_league_size ; }
-
-  // One active team per shepherd
-  TeamPolicyInternal( Kokkos::Qthread & q
-                    , const int league_size
-                    , const int team_size
-                    , const int /* vector_length */ = 0
-                    )
-    : m_league_size( league_size )
-    , m_team_size( team_size < q.shepherd_worker_size()
-                 ? team_size : q.shepherd_worker_size() )
-    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
-    {
-    }
-
-  // One active team per shepherd
-  TeamPolicyInternal( const int league_size
-                    , const int team_size
-                    , const int /* vector_length */ = 0
-                    )
-    : m_league_size( league_size )
-    , m_team_size( team_size < Qthread::instance().shepherd_worker_size()
-                 ? team_size : Qthread::instance().shepherd_worker_size() )
-    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
-    {
-    }
-
-  typedef Impl::QthreadTeamPolicyMember member_type ;
-
-  friend class Impl::QthreadTeamPolicyMember ;
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_QTHREADEXEC_HPP */
-
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b92494084c10763ad60ba458888204bd2bd77a3
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
@@ -0,0 +1,519 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <utility>
+
+#include <Kokkos_Qthreads.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+// Defines to enable experimental Qthreads functionality.
+//#define QTHREAD_LOCAL_PRIORITY
+//#define CLONED_TASKS
+
+//#include <qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1, n - 2, ..., 0 ]
+ */
+QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
+
+int  s_number_shepherds            = 0;
+int  s_number_workers_per_shepherd = 0;
+int  s_number_workers              = 0;
+
+inline
+QthreadsExec ** worker_exec()
+{
+  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
+}
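+
+/* Illustrative example: with 2 shepherds and 4 workers per shepherd
+ * ( s_number_workers == 8 ), the worker on shepherd 1 with local rank 2 has
+ * forward rank 1 * 4 + 2 == 6 and is stored at s_exec[ 8 - ( 6 + 1 ) ] ==
+ * s_exec[1]; forward rank 0 lands at s_exec[7], i.e. the array is kept in
+ * reverse rank order as described above.
+ */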
+
+const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
+
+int s_worker_reduce_end   = 0;  // End of worker reduction memory.
+int s_worker_shared_end   = 0;  // Total of worker scratch memory.
+int s_worker_shared_begin = 0;  // Beginning of worker shared memory.
+
+QthreadsExecFunctionPointer volatile s_active_function     = 0;
+const void                * volatile s_active_function_arg = 0;
+
+} // namespace
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Qthreads::is_initialized()
+{
+  return Impl::s_number_workers != 0;
+}
+
+int Qthreads::concurrency()
+{
+  return Impl::s_number_workers_per_shepherd;
+}
+
+int Qthreads::in_parallel()
+{
+  return Impl::s_active_function != 0;
+}
+
+void Qthreads::initialize( int thread_count )
+{
+  // Environment variable: QTHREAD_NUM_SHEPHERDS
+  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
+  // Environment variable: QTHREAD_HWPAR
+
+  {
+    // putenv() retains a pointer to the string, so the buffer must outlive this scope.
+    static char buffer[256];
+    snprintf( buffer, sizeof(buffer), "QTHREAD_HWPAR=%d", thread_count );
+    putenv( buffer );
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
+
+    for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg;
+
+    msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {";
+      for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
+        msg << " " << qthread_num_workers_local( i );
+      }
+      msg << " }";
+    }
+
+    Impl::s_number_workers              = 0;
+    Impl::s_number_shepherds            = 0;
+    Impl::s_number_workers_per_shepherd = 0;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
+
+  // Init the lock array used for arbitrarily sized atomics.
+  Impl::init_lock_array_host_space();
+}
+
+void Qthreads::finalize()
+{
+  Impl::QthreadsExec::clear_workers();
+
+  if ( Impl::s_number_workers ) {
+    qthread_finalize();
+  }
+
+  Impl::s_number_workers              = 0;
+  Impl::s_number_shepherds            = 0;
+  Impl::s_number_workers_per_shepherd = 0;
+}
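+
+/* Typical host-side usage (illustrative sketch):
+ *
+ *   Kokkos::Qthreads::initialize( 16 );  // 16 == shepherds * workers-per-shepherd
+ *   // ... dispatch parallel_for / parallel_reduce on Kokkos::Qthreads ...
+ *   Kokkos::Qthreads::finalize();
+ */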
+
+void Qthreads::print_configuration( std::ostream & s, const bool detail )
+{
+  s << "Kokkos::Qthreads {"
+    << " num_shepherds(" << Impl::s_number_shepherds << ")"
+    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
+    << " }" << std::endl;
+}
+
+Qthreads & Qthreads::instance( int )
+{
+  static Qthreads q;
+  return q;
+}
+
+void Qthreads::fence()
+{
+}
+
+int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
+int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+aligned_t driver_exec_all( void * arg )
+{
+  QthreadsExec & exec = **worker_exec();
+
+  (*s_active_function)( exec, s_active_function_arg );
+
+/*
+  fprintf( stdout
+         , "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , exec.worker_rank()
+         , exec.worker_size()
+         , exec.shepherd_rank()
+         , exec.shepherd_size()
+         , exec.shepherd_worker_rank()
+         , exec.shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  return 0;
+}
+
+aligned_t driver_resize_worker_scratch( void * arg )
+{
+  static volatile int lock_begin = 0;
+  static volatile int lock_end   = 0;
+
+  QthreadsExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety.
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
+
+  const bool ok = 0 == *exec;
+
+  if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
+
+  lock_begin = 0; // Release lock.
+
+  if ( ok ) { new( *exec ) QthreadsExec(); }
+
+  //----------------------------------------
+  // Wait for all calls to complete to ensure that each worker has executed.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
+
+  while ( lock_end );
+
+/*
+  fprintf( stdout
+         , "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , (**exec).worker_rank()
+         , (**exec).worker_size()
+         , (**exec).shepherd_rank()
+         , (**exec).shepherd_size()
+         , (**exec).shepherd_worker_rank()
+         , (**exec).shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  //----------------------------------------
+
+  if ( ! ok ) {
+    fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
+    fflush( stderr );
+  }
+
+  return 0;
+}
+
+void verify_is_process( const char * const label, bool not_active = false )
+{
+  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local( NULL );
+  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );
+
+  if ( not_process || is_active ) {
+    std::string msg( label );
+    msg.append( " : FAILED" );
+    if ( not_process ) msg.append(" : not called by main process");
+    if ( is_active )   msg.append(" : parallel execution in progress");
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+} // namespace
+
+int QthreadsExec::worker_per_shepherd()
+{
+  return s_number_workers_per_shepherd;
+}
+
+QthreadsExec::QthreadsExec()
+{
+  const int shepherd_rank        = qthread_shep();
+  const int shepherd_worker_rank = qthread_worker_local( NULL );
+  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank;
+
+  m_worker_base          = s_exec;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size;
+  m_reduce_end           = s_worker_reduce_end;
+  m_shepherd_rank        = shepherd_rank;
+  m_shepherd_size        = s_number_shepherds;
+  m_shepherd_worker_rank = shepherd_worker_rank;
+  m_shepherd_worker_size = s_number_workers_per_shepherd;
+  m_worker_rank          = worker_rank;
+  m_worker_size          = s_number_workers;
+  m_worker_state         = QthreadsExec::Active;
+}
+
+void QthreadsExec::clear_workers()
+{
+  for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
+    QthreadsExec * const exec = s_exec[iwork];
+    s_exec[iwork] = 0;
+    free( exec );
+  }
+}
+
+void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
+{
+  new( & space )
+    Qthreads::scratch_memory_space(
+      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin,
+      s_worker_shared_end - s_worker_shared_begin
+    );
+}
+
+void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
+{
+  const int exec_all_reduce_alloc = align_alloc( reduce_size );
+  const int shepherd_scan_alloc   = align_alloc( 8 );
+  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
+
+  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
+       s_worker_shared_end < shepherd_shared_end ) {
+
+/*
+  fprintf( stdout, "QthreadsExec::resize\n");
+  fflush(stdout);
+*/
+
+    // Clear current worker memory before allocating new worker memory.
+    clear_workers();
+
+    // Increase the buffers to an aligned allocation.
+    s_worker_reduce_end   = exec_all_reduce_alloc;
+    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
+    s_worker_shared_end   = shepherd_shared_end;
+
+    // Need to query which shepherd this main 'process' is running.
+
+    const int main_shep = qthread_shep();
+
+    // Have each worker resize its memory for proper first-touch.
+#if 0
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
+        qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
+      }
+    }
+#else
+    // If this function is used before the 'qthreads.task_policy' unit test,
+    // that test fails with a seg-fault within libqthread.so.
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+      if ( num_clone ) {
+        const int ret = qthread_fork_clones_to_local_priority
+          ( driver_resize_worker_scratch   // Function
+          , NULL                           // Function data block
+          , NULL                           // Pointer to return value feb
+          , jshep                          // Shepherd number
+          , num_clone - 1                  // Number of instances - 1
+          );
+
+        assert( ret == QTHREAD_SUCCESS );
+      }
+    }
+#endif
+
+    driver_resize_worker_scratch( NULL );
+
+    // Verify all workers allocated.
+
+    bool ok = true;
+    for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
+
+    if ( ! ok ) {
+      std::ostringstream msg;
+      msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
+      for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
+         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
+      }
+      msg << " }";
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+}
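+
+/* Illustrative numbers: Qthreads::initialize() calls resize_worker_scratch( 256, 256 ).
+ * With the 64-byte grain of align_alloc() this gives
+ *   s_worker_reduce_end   == 256  (reduction scratch),
+ *   s_worker_shared_begin == 320  (a 64-byte shepherd scan slot in between),
+ *   s_worker_shared_end   == 576  (end of team-shared scratch),
+ * so each worker's block is allocated as s_base_size + 576 bytes.
+ */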
+
+void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
+{
+  verify_is_process("QthreadsExec::exec_all(...)",true);
+
+/*
+  fprintf( stdout, "QthreadsExec::exec_all\n");
+  fflush(stdout);
+*/
+
+  s_active_function     = func;
+  s_active_function_arg = arg;
+
+  // Need to query which shepherd this main 'process' is running.
+
+  const int main_shep = qthread_shep();
+
+#if 0
+  for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
+    for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
+      qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
+    }
+  }
+#else
+  // If this function is used before the 'qthreads.task_policy' unit test,
+  // that test fails with a seg-fault within libqthread.so.
+  for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+    if ( num_clone ) {
+      const int ret = qthread_fork_clones_to_local_priority
+        ( driver_exec_all   // Function
+        , NULL              // Function data block
+        , NULL              // Pointer to return value feb
+        , jshep             // Shepherd number
+        , num_clone - 1     // Number of instances - 1
+        );
+
+      assert(ret == QTHREAD_SUCCESS);
+    }
+  }
+#endif
+
+  driver_exec_all( NULL );
+
+  s_active_function     = 0;
+  s_active_function_arg = 0;
+}
+
+void * QthreadsExec::exec_all_reduce_result()
+{
+  return s_exec[0]->m_scratch_alloc;
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()
+  : m_exec( **worker_exec() )
+  , m_team_shared( 0, 0 )
+  , m_team_size( 1 )
+  , m_team_rank( 0 )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );
+}
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )
+  : m_exec( **worker_exec() )
+  , m_team_shared( 0, 0 )
+  , m_team_size( s_number_workers_per_shepherd )
+  , m_team_rank( m_exec.shepherd_worker_rank() )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..64856eb99e014272fd92f638e2d7f312d3039120
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
@@ -0,0 +1,640 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADSEXEC_HPP
+#define KOKKOS_QTHREADSEXEC_HPP
+
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsExec;
+
+typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
+
+class QthreadsExec {
+private:
+  enum { Inactive = 0, Active = 1 };
+
+  const QthreadsExec * const * m_worker_base;
+  const QthreadsExec * const * m_shepherd_base;
+
+  void  * m_scratch_alloc;  ///< Scratch memory [ reduce, team, shared ]
+  int     m_reduce_end;     ///< End of scratch reduction memory
+
+  int     m_shepherd_rank;
+  int     m_shepherd_size;
+
+  int     m_shepherd_worker_rank;
+  int     m_shepherd_worker_size;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank;
+  int     m_worker_size;
+
+  int mutable volatile m_worker_state;
+
+  friend class Kokkos::Qthreads;
+
+  ~QthreadsExec();
+  QthreadsExec( const QthreadsExec & );
+  QthreadsExec & operator = ( const QthreadsExec & );
+
+public:
+  QthreadsExec();
+
+  /** Execute the input function on all available Qthreads workers. */
+  static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
+
+  /** Barrier across all workers participating in the 'exec_all'. */
+  void exec_all_barrier() const
+  {
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
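+
+  /* Worked example of the fan-in / fan-out above (illustrative): with
+   * m_worker_size == 4, reverse ranks 1 and 3 have bit 0 set and go inactive
+   * immediately; reverse rank 2 first waits on 3, then goes inactive; reverse
+   * rank 0 waits on 1 and then on 2, acting as the root of a binary tree.
+   * The final loop re-activates the same workers in fan-out order.
+   */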
+
+  /** Barrier across workers within the shepherd with rank < team_size. */
+  void shepherd_barrier( const int team_size ) const
+  {
+    if ( m_shepherd_worker_rank < team_size ) {
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n, j;
+
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadsExec::Inactive;
+        Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+      }
+
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+      }
+    }
+  }
+
+  /** Reduce across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ReducerType, class ArgTag >
+  inline
+  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
+  {
+    typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+    typedef typename ReducerConditional::type ReducerTypeFwd;
+    typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
+
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      const QthreadsExec & fan = *m_worker_base[j];
+
+      Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
+
+      ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  /** Scan across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ArgTag >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+  {
+    typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
+    typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
+    typedef Kokkos::Impl::FunctorValueOps<  FunctorType, ArgTag > ValueOps;
+
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_worker_base[0] is the
+      // highest ranking thread.
+
+      // Copy from lower ranking to higher ranking worker.
+      for ( int i = 1; i < m_worker_size; ++i ) {
+        ValueOps::copy( func
+                      , m_worker_base[i-1]->m_scratch_alloc
+                      , m_worker_base[i]->m_scratch_alloc
+                      );
+      }
+
+      ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+      // Join from lower ranking to higher ranking worker.
+      // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
+      for ( int i = m_worker_size - 1; --i > 0; ) {
+        ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
+      }
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  //----------------------------------------
+
+  template< class Type >
+  inline
+  volatile Type * shepherd_team_scratch_value() const
+  { return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
+
+  template< class Type >
+  inline
+  void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
+  {
+    if ( m_shepherd_base ) {
+      Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
+      memory_fence();
+      shepherd_barrier( team_size );
+      value = *shared_value;
+    }
+  }
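+
+  /* Illustrative: shepherd_broadcast( v, team_size, 0 ) leaves every
+   * participating worker with the value passed in by the worker whose
+   * shepherd-local rank is 0; the other workers' inputs are ignored.
+   */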
+
+  template< class Type >
+  inline
+  Type shepherd_reduce( const int team_size, const Type & value ) const
+  {
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+//    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      // Accumulate the contributions of the team's workers and broadcast
+      // the result, using the same team_size bound as the JoinOp overload.
+      for ( int i = 1; i < team_size; ++i ) {
+        accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class JoinOp >
+  inline
+  typename JoinOp::value_type
+  shepherd_reduce( const int team_size
+                 , const typename JoinOp::value_type & value
+                 , const JoinOp & op ) const
+  {
+    typedef typename JoinOp::value_type Type;
+
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+//    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class Type >
+  inline
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+  {
+    *shepherd_team_scratch_value<Type>() = value;
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_shepherd_base[0] is the
+      // highest ranking thread.
+
+      // Copy from lower ranking to higher ranking worker.
+
+      Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        accum += tmp;
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
+      }
+
+      *m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
+        global_value ? atomic_fetch_add( global_value, accum ) : 0;
+
+      // Join from lower ranking to higher ranking worker.
+      for ( int i = team_size; --i; ) {
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  //----------------------------------------
+
+  static inline
+  int align_alloc( int size )
+  {
+    enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */ };
+    enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
+    return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK;
+  }
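+
+  /* Illustrative: align_alloc(100) == 128 and align_alloc(64) == 64; sizes are
+   * rounded up to the next multiple of the 64-byte allocation grain.
+   */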
+
+  void shared_reset( Qthreads::scratch_memory_space & );
+
+  void * exec_all_reduce_value() const { return m_scratch_alloc; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size, const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank; }
+  inline int worker_size() const { return m_worker_size; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
+  inline int shepherd_rank() const { return m_shepherd_rank; }
+  inline int shepherd_size() const { return m_shepherd_size; }
+
+  static int worker_per_shepherd();
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsTeamPolicyMember {
+private:
+  typedef Kokkos::Qthreads                       execution_space;
+  typedef execution_space::scratch_memory_space  scratch_memory_space;
+
+  Impl::QthreadsExec   & m_exec;
+  scratch_memory_space   m_team_shared;
+  const int              m_team_size;
+  const int              m_team_rank;
+  const int              m_league_size;
+  const int              m_league_end;
+        int              m_league_rank;
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_team_shared; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  {}
+#else
+  { m_exec.shepherd_barrier( m_team_size ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
+#endif
+
+  template< typename JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+  team_reduce( const typename JoinOp::value_type & value
+             , const JoinOp & op ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return typename JoinOp::value_type(); }
+#else
+  { return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the league's
+   *  parallel execution, be the scan's total.  Parallel execution ordering of
+   *  the league's teams is non-deterministic.  As such the base value for each
+   *  team's scan operation is similarly non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
+#endif
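+
+  /* Illustrative use from a team functor (sketch; 'my_count' is a hypothetical
+   * per-thread contribution, not part of this class):
+   *
+   *   KOKKOS_INLINE_FUNCTION
+   *   void operator()( const member_type & member ) const {
+   *     const int offset = member.team_scan( my_count );  // exclusive prefix sum
+   *     // The highest team rank can recover the team total:
+   *     const int total  = offset + my_count;
+   *   }
+   */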
+
+  //----------------------------------------
+  // Private driver for task-team parallel.
+
+  struct TaskTeam {};
+
+  QthreadsTeamPolicyMember();
+  explicit QthreadsTeamPolicyMember( const TaskTeam & );
+
+  //----------------------------------------
+  // Private for the driver:  for ( member_type i( exec, team ); i; i.next_team() ) { ... }
+
+  // Initialize.
+  template< class ... Properties >
+  QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
+                          , const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
+    : m_exec( exec )
+    , m_team_shared( 0, 0 )
+    , m_team_size( team.m_team_size )
+    , m_team_rank( exec.shepherd_worker_rank() )
+    , m_league_size( team.m_league_size )
+    , m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
+  {
+    m_exec.shared_reset( m_team_shared );
+  }
+
+  // Continue.
+  operator bool () const { return m_league_rank < m_league_end; }
+
+  // Iterate.
+  void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
+};
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
+  : public PolicyTraits< Properties... >
+{
+private:
+  const int m_league_size;
+  const int m_team_size;
+  const int m_shepherd_iter;
+
+public:
+  //! Tag this class as a Kokkos execution policy.
+  typedef TeamPolicyInternal              execution_policy;
+  typedef Qthreads                        execution_space;
+  typedef PolicyTraits< Properties ... >  traits;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+  { return Qthreads::instance().shepherd_worker_size(); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & f )
+  { return team_size_max( f ); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & f, const int& )
+  { return team_size_max( f ); }
+
+  //----------------------------------------
+
+  inline int team_size()   const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+
+  // One active team per shepherd.
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // One active team per shepherd.
+  TeamPolicyInternal( const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
+                 ? team_size : Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.
+  TeamPolicyInternal( const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
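+
+  /* Illustrative: with league_size == 10 and 4 shepherds, m_shepherd_iter ==
+   * ( 10 + 4 - 1 ) / 4 == 3, i.e. each shepherd's single active team steps
+   * through at most 3 league ranks.
+   */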
+
+  // TODO: Doesn't do anything yet.  Fix this.
+  /** \brief Set chunk_size to a discrete value. */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+//    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  typedef Impl::QthreadsTeamPolicyMember member_type;
+
+  friend class Impl::QthreadsTeamPolicyMember;
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif // #define KOKKOS_QTHREADSEXEC_HPP
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
similarity index 86%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
index cb5b18094833a48905293175f6655f08f4596c8c..9f996075403f7cdd06fddfcb60d829dfab64bf0a 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
@@ -41,8 +41,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
-#define KOKKOS_QTHREAD_PARALLEL_HPP
+#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
+#define KOKKOS_QTHREADS_PARALLEL_HPP
 
 #include <vector>
 
@@ -51,7 +51,7 @@
 #include <impl/Kokkos_StaticAssert.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#include <Qthread/Kokkos_QthreadExec.hpp>
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
 
 //----------------------------------------------------------------------------
 
@@ -63,7 +63,7 @@ namespace Impl {
 template< class FunctorType , class ... Traits >
 class ParallelFor< FunctorType
                  , Kokkos::RangePolicy< Traits ... >
-                 , Kokkos::Qthread
+                 , Kokkos::Qthreads
                  >
 {
 private:
@@ -99,7 +99,7 @@ private:
     }
 
   // Function is called once by every concurrent thread.
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelFor & self = * ((const ParallelFor *) arg );
 
@@ -116,7 +116,7 @@ public:
   inline
   void execute() const
     {
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
 
     }
 
@@ -134,7 +134,7 @@ template< class FunctorType , class ReducerType , class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
                     , ReducerType
-                    , Kokkos::Qthread
+                    , Kokkos::Qthreads
                     >
 {
 private:
@@ -186,7 +186,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelReduce & self = * ((const ParallelReduce *) arg );
 
@@ -205,10 +205,10 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
 
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
 
       Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
 
@@ -246,11 +246,11 @@ public:
 template< class FunctorType , class ... Properties >
 class ParallelFor< FunctorType
                  , TeamPolicy< Properties ... >
-                 , Kokkos::Qthread >
+                 , Kokkos::Qthreads >
 {
 private:
 
-  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
 
@@ -282,7 +282,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelFor & self = * ((const ParallelFor *) arg );
 
@@ -297,10 +297,10 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch
+      QthreadsExec::resize_worker_scratch
         ( /* reduction   memory */ 0
         , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
     }
 
   ParallelFor( const FunctorType & arg_functor ,
@@ -316,12 +316,12 @@ template< class FunctorType , class ReducerType , class ... Properties >
 class ParallelReduce< FunctorType
                     , TeamPolicy< Properties... >
                     , ReducerType
-                    , Kokkos::Qthread
+                    , Kokkos::Qthreads
                     >
 {
 private:
 
-  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
 
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
@@ -365,7 +365,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelReduce & self = * ((const ParallelReduce *) arg );
 
@@ -383,13 +383,13 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch
+      QthreadsExec::resize_worker_scratch
         ( /* reduction   memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
         , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
 
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
 
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
 
       Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
 
@@ -429,7 +429,7 @@ public:
 template< class FunctorType , class ... Traits >
 class ParallelScan< FunctorType
                   , Kokkos::RangePolicy< Traits ... >
-                  , Kokkos::Qthread
+                  , Kokkos::Qthreads
                   >
 {
 private:
@@ -474,7 +474,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelScan & self = * ((const ParallelScan *) arg );
 
@@ -497,8 +497,8 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
     }
 
   ParallelScan( const FunctorType & arg_functor
@@ -521,37 +521,37 @@ namespace Kokkos {
 
 template< typename iType >
 KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >
-TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType& count )
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
 {
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, count );
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
 }
 
 template< typename iType1, typename iType2 >
 KOKKOS_INLINE_FUNCTION
 Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::QthreadTeamPolicyMember >
-TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
+                                       Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
 {
   typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, iType(begin), iType(end) );
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
 }
 
 template<typename iType>
 KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
-  ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
+  ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
 }
 
 KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
 }
 
 KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
-  return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
 }
 
 /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
@@ -560,7 +560,7 @@ Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::Qt
  * This functionality requires C++11 support.*/
 template<typename iType, class Lambda>
 KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
     lambda(i);
 }
@@ -571,7 +571,7 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qthrea
  * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
                      const Lambda & lambda, ValueType& result) {
 
   result = ValueType();
@@ -595,7 +595,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
  * '1 for *'). This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
                      const Lambda & lambda, const JoinType& join, ValueType& init_result) {
 
   ValueType result = init_result;
@@ -615,7 +615,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
  * This functionality requires C++11 support.*/
 template<typename iType, class Lambda>
 KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
     loop_boundaries, const Lambda& lambda) {
   #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
   #pragma ivdep
@@ -630,7 +630,7 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qthr
  * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const Lambda & lambda, ValueType& result) {
   result = ValueType();
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
@@ -652,7 +652,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
  * '1 for *'). This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
 
   ValueType result = init_result;
@@ -679,7 +679,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
  * This functionality requires C++11 support.*/
 template< typename iType, class FunctorType >
 KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const FunctorType & lambda) {
 
   typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
@@ -697,25 +697,25 @@ void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qth
 
 template<class FunctorType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
   lambda();
 }
 
 template<class FunctorType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
   if(single_struct.team_member.team_rank()==0) lambda();
 }
 
 template<class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
   lambda(val);
 }
 
 template<class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
   if(single_struct.team_member.team_rank()==0) {
     lambda(val);
   }
@@ -724,4 +724,4 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& singl
 
 } // namespace Kokkos
 
-#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
+#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..614a2c03f03e8c9cfbd15653295a254a350fb25a
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
@@ -0,0 +1,320 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::Qthreads >::TaskExec()
+  : m_self_exec( 0 ),
+    m_team_exec( 0 ),
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( 0 ),
+    m_team_rank( 0 ),
+    m_team_size( 1 )
+{}
+
+TaskExec< Kokkos::Qthreads >::
+TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )
+  : m_self_exec( & arg_exec ),
+    m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
+    m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
+    m_team_size( arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
+
+  sync[0] = int64_t(0) ;
+  sync[1] = int64_t(0) ;
+
+  for ( int i = 0 ; i < m_team_size ; ++i ) {
+    m_sync_value |= int64_t(1) << (8*i);
+    m_sync_mask  |= int64_t(3) << (8*i);
+  }
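+  // Illustration ( assuming, for example, m_team_size == 4 ): the loop above
+  // yields m_sync_value == 0x01010101 and m_sync_mask == 0x03030303, so each
+  // team member owns one byte of the 64-bit sync word; team_barrier() below
+  // alternates the expected per-byte value between 0x01 and 0x02 by applying
+  // m_sync_value ^= m_sync_mask every other step.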
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::Qthreads >::team_barrier() const
+{
+  if ( 1 < m_team_size ) {
+
+    if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+      Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
+    }
+
+    // Use team shared memory to synchronize.
+    // Alternate memory locations between barriers to avoid a sequence
+    // of barriers overtaking one another.
+
+    int64_t volatile * const sync =
+      ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+    // This team member sets one byte within the sync variable
+    int8_t volatile * const sync_self =
+     ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+    while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    ++m_sync_step ;
+
+    if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
+      m_sync_value ^= m_sync_mask ;
+      if ( 1000 < m_sync_step ) m_sync_step = 0 ;
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::execute
+  ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using PoolExec        = Kokkos::Impl::QthreadsExec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;
+    Member team_exec( self, team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire Qthreads thread pool to ensure initialization.
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      // Each team lead attempts to acquire either a thread team task
+      // or collection of single thread tasks for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task_root_type * tmp =
+          0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
+            tmp = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+
+        *task_shared = tmp ;
+
+        // Fence to be sure *task_shared is stored
+        Kokkos::memory_fence();
+      }
+
+      // Whole team waits for every team member to reach this statement
+      team_exec.team_barrier();
+
+      Kokkos::memory_fence();
+
+      task_root_type * const task = *task_shared ;
+
+#if 0
+fprintf( stdout,
+         "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
+         team_exec.m_group_rank,
+         team_exec.m_team_rank,
+         uintptr_t(task_shared),
+         uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task, & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task );
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task, & single_exec );
+
+          queue->complete( task );
+        }
+
+        // All team members wait for whole team to reach this statement.
+        // This is not necessary for completing the task, but it is necessary
+        // to prevent task_shared from being updated before it has been read
+        // by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  if ( 1 == omp_get_num_threads() ) {
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    Member single_exec ;
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;
+
+      (*task->m_apply)( task, & single_exec );
+
+      queue->complete( task );
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..836452dde93767f172e47d2c19f74498e4dde246
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
+#define KOKKOS_IMPL_QTHREADS_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::Qthreads >
+{
+public:
+
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< execution_space,
+                                 typename FunctorType::value_type,
+                                 FunctorType
+                               > ;
+      *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Qthreads >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::QthreadsExec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;
+  int64_t mutable  m_sync_value ;
+  int     mutable  m_sync_step ;
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec, int arg_team_size );
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.
+   */
+  void team_barrier() const ;
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
similarity index 91%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
index 50444177ceaa46218f9757636d46c8a1a0b339bf..aa159cff6a5211d721a7b6beb31a5969851d080d 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
@@ -41,11 +41,11 @@
 //@HEADER
 */
 
-// Experimental unified task-data parallel manycore LDRD
+// Experimental unified task-data parallel manycore LDRD.
 
 #include <Kokkos_Core_fwd.hpp>
 
-#if defined( KOKKOS_ENABLE_QTHREAD )
+#if defined( KOKKOS_ENABLE_QTHREADS )
 
 #include <stdio.h>
 
@@ -56,17 +56,15 @@
 #include <string>
 
 #include <Kokkos_Atomic.hpp>
-#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-//----------------------------------------------------------------------------
-
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-typedef TaskMember< Kokkos::Qthread , void , void > Task ;
+typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
 
 namespace {
 
@@ -173,16 +171,16 @@ Task::TaskMember( const function_dealloc_type  arg_dealloc
 
 void Task::throw_error_add_dependence() const
 {
-  std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
+  std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
             << " state(" << m_state << ")"
             << " dep_size(" << m_dep_size << ")"
             << std::endl ;
-  throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
+  throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
 }
 
 void Task::throw_error_verify_type()
 {
-  throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
+  throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
 }
 
 //----------------------------------------------------------------------------
@@ -190,7 +188,7 @@ void Task::throw_error_verify_type()
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
 void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
 {
-  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
+  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
   static const char msg_error_count[]       = ": negative reference count" ;
   static const char msg_error_complete[]    = ": destroy task that is not complete" ;
   static const char msg_error_dependences[] = ": destroy task that has dependences" ;
@@ -294,7 +292,7 @@ fflush(stdout);
       assign( & m_dep[i] , 0 );
     }
 
-    // Set qthread FEB to full so that dependent tasks are allowed to execute.
+    // Set Qthreads FEB to full so that dependent tasks are allowed to execute.
     // This 'task' may be deleted immediately following this function call.
     qthread_fill( & m_qfeb );
 
@@ -319,10 +317,10 @@ aligned_t Task::qthread_func( void * arg )
                                         );
 
   if ( task->m_apply_team && ! task->m_apply_single ) {
-    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
+    Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
 
     // Initialize team size and rank with shephered info
-    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
+    Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
 
     (*task->m_apply_team)( task , member );
 
@@ -344,7 +342,7 @@ fflush(stdout);
   }
   else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
     // Team hard-wired to one, no cloning
-    Kokkos::Impl::QthreadTeamPolicyMember member ;
+    Kokkos::Impl::QthreadsTeamPolicyMember member ;
     (*task->m_apply_team)( task , member );
     task->closeout();
   }
@@ -384,8 +382,8 @@ void Task::schedule()
   // Increment active task count before spawning.
   Kokkos::atomic_increment( m_active_count );
 
-  // spawn in qthread.  must malloc the precondition array and give to qthread.
-  // qthread will eventually free this allocation so memory will not be leaked.
+  // Spawn in Qthreads. Must malloc the precondition array and give it to Qthreads.
+  // Qthreads will eventually free this allocation so memory will not be leaked.
 
   // concern with thread safety of malloc, does this need to be guarded?
   aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
@@ -393,7 +391,7 @@ void Task::schedule()
   qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
 
   for ( int i = 0 ; i < m_dep_size ; ++i ) {
-    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
+    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
   }
 
   if ( m_apply_team && ! m_apply_single ) {
@@ -446,7 +444,7 @@ fflush(stdout);
 namespace Kokkos {
 namespace Experimental {
 
-TaskPolicy< Kokkos::Qthread >::
+TaskPolicy< Kokkos::Qthreads >::
 TaskPolicy
   ( const unsigned /* arg_task_max_count */
   , const unsigned /* arg_task_max_size */
@@ -462,7 +460,7 @@ TaskPolicy
 
   if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
     std::ostringstream msg ;
-    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
         << "default_depedence = " << arg_task_default_dependence_capacity
         << " , team_size = " << arg_task_team_size
         << " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
@@ -470,14 +468,14 @@ TaskPolicy
   }
 }
 
-TaskPolicy< Kokkos::Qthread >::member_type &
-TaskPolicy< Kokkos::Qthread >::member_single()
+TaskPolicy< Kokkos::Qthreads >::member_type &
+TaskPolicy< Kokkos::Qthreads >::member_single()
 {
   static member_type s ;
   return s ;
 }
 
-void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
 {
   volatile int * const active_task_count = & policy.m_active_count ;
   while ( *active_task_count ) qthread_yield();
@@ -486,6 +484,5 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
 } // namespace Experimental
 } // namespace Kokkos
 
-#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
-
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
similarity index 90%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
index 565dbf7e61716717bdbac0e1b3adf007493cf27d..1e5a4dc593cc6de9fff9d2a762b4f864c6c12e9c 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
@@ -43,15 +43,15 @@
 
 // Experimental unified task-data parallel manycore LDRD
 
-#ifndef KOKKOS_QTHREAD_TASKSCHEDULER_HPP
-#define KOKKOS_QTHREAD_TASKSCHEDULER_HPP
+#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
+#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
 
 #include <string>
 #include <typeinfo>
 #include <stdexcept>
 
 //----------------------------------------------------------------------------
-// Defines to enable experimental Qthread functionality
+// Defines to enable experimental Qthreads functionality
 
 #define QTHREAD_LOCAL_PRIORITY
 #define CLONED_TASKS
@@ -63,7 +63,7 @@
 
 //----------------------------------------------------------------------------
 
-#include <Kokkos_Qthread.hpp>
+#include <Kokkos_Qthreads.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_View.hpp>
 
@@ -78,13 +78,13 @@ namespace Experimental {
 namespace Impl {
 
 template<>
-class TaskMember< Kokkos::Qthread , void , void >
+class TaskMember< Kokkos::Qthreads , void , void >
 {
 public:
 
   typedef TaskMember * (* function_verify_type) ( TaskMember * );
   typedef void         (* function_single_type) ( TaskMember * );
-  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
   typedef void         (* function_dealloc_type)( TaskMember * );
 
 private:
@@ -94,7 +94,7 @@ private:
   const function_single_type   m_apply_single ;  ///< Apply function
   const function_team_type     m_apply_team ;    ///< Apply function
   int volatile * const         m_active_count ;  ///< Count of active tasks on this policy
-  aligned_t                    m_qfeb ;          ///< Qthread full/empty bit
+  aligned_t                    m_qfeb ;          ///< Qthreads full/empty bit
   TaskMember ** const          m_dep ;           ///< Dependences
   const int                    m_dep_capacity ;  ///< Capacity of dependences
   int                          m_dep_size ;      ///< Actual count of dependences
@@ -129,7 +129,7 @@ protected :
 
   ~TaskMember();
 
-  // Used by TaskMember< Qthread , ResultType , void >
+  // Used by TaskMember< Qthreads , ResultType , void >
   TaskMember( const function_verify_type   arg_verify
             , const function_dealloc_type  arg_dealloc
             , const function_single_type   arg_apply_single
@@ -139,7 +139,7 @@ protected :
             , const unsigned               arg_dependence_capacity
             );
 
-  // Used for TaskMember< Qthread , void , void >
+  // Used for TaskMember< Qthreads , void , void >
   TaskMember( const function_dealloc_type  arg_dealloc
             , const function_single_type   arg_apply_single
             , const function_team_type     arg_apply_team
@@ -175,15 +175,15 @@ public:
   /*  Inheritence Requirements on task types:
    *    typedef  FunctorType::value_type  value_type ;
    *    class DerivedTaskType
-   *      : public TaskMember< Qthread , value_type , FunctorType >
+   *      : public TaskMember< Qthreads , value_type , FunctorType >
    *      { ... };
-   *    class TaskMember< Qthread , value_type , FunctorType >
-   *      : public TaskMember< Qthread , value_type , void >
+   *    class TaskMember< Qthreads , value_type , FunctorType >
+   *      : public TaskMember< Qthreads , value_type , void >
    *      , public Functor
    *      { ... };
    *  If value_type != void
-   *    class TaskMember< Qthread , value_type , void >
-   *      : public TaskMember< Qthread , void , void >
+   *    class TaskMember< Qthreads , value_type , void >
+   *      : public TaskMember< Qthreads , void , void >
    *
    *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
    *
@@ -300,10 +300,10 @@ public:
   KOKKOS_INLINE_FUNCTION static
   void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
-      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
       //   , public FunctorType
       //   { ... };
 
@@ -316,10 +316,10 @@ public:
   KOKKOS_INLINE_FUNCTION static
   void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
-      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
       //   , public FunctorType
       //   { ... };
 
@@ -333,9 +333,9 @@ public:
   template< class FunctorType , class ResultType >
   KOKKOS_INLINE_FUNCTION static
   void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
-                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
       derived_type & m = * static_cast< derived_type * >( t );
 
@@ -345,9 +345,9 @@ public:
   template< class FunctorType , class ResultType >
   KOKKOS_INLINE_FUNCTION static
   void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
-                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
       derived_type & m = * static_cast< derived_type * >( t );
 
@@ -356,7 +356,7 @@ public:
 };
 
 //----------------------------------------------------------------------------
-/** \brief  Base class for tasks with a result value in the Qthread execution space.
+/** \brief  Base class for tasks with a result value in the Qthreads execution space.
  *
  *  The FunctorType must be void because this class is accessed by the
  *  Future class for the task and result value.
@@ -365,8 +365,8 @@ public:
  *  can correctly static_cast from the 'root class' to this class.
  */
 template < class ResultType >
-class TaskMember< Kokkos::Qthread , ResultType , void >
-  : public TaskMember< Kokkos::Qthread , void , void >
+class TaskMember< Kokkos::Qthreads , ResultType , void >
+  : public TaskMember< Kokkos::Qthreads , void , void >
 {
 public:
 
@@ -379,7 +379,7 @@ public:
 
 protected:
 
-  typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
+  typedef TaskMember< Kokkos::Qthreads , void , void >  task_root_type ;
   typedef task_root_type::function_dealloc_type        function_dealloc_type ;
   typedef task_root_type::function_single_type         function_single_type ;
   typedef task_root_type::function_team_type           function_team_type ;
@@ -404,16 +404,16 @@ protected:
 };
 
 template< class ResultType , class FunctorType >
-class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-  : public TaskMember< Kokkos::Qthread , ResultType , void >
+class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthreads , ResultType , void >
   , public FunctorType
 {
 public:
 
   typedef FunctorType  functor_type ;
 
-  typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
-  typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
+  typedef TaskMember< Kokkos::Qthreads , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Qthreads , ResultType , void >  task_base_type ;
   typedef task_root_type::function_dealloc_type              function_dealloc_type ;
   typedef task_root_type::function_single_type               function_single_type ;
   typedef task_root_type::function_team_type                 function_team_type ;
@@ -447,16 +447,16 @@ public:
 namespace Kokkos {
 namespace Experimental {
 
-void wait( TaskPolicy< Kokkos::Qthread > & );
+void wait( TaskPolicy< Kokkos::Qthreads > & );
 
 template<>
-class TaskPolicy< Kokkos::Qthread >
+class TaskPolicy< Kokkos::Qthreads >
 {
 public:
 
-  typedef Kokkos::Qthread                        execution_space ;
+  typedef Kokkos::Qthreads                        execution_space ;
   typedef TaskPolicy                             execution_policy ;
-  typedef Kokkos::Impl::QthreadTeamPolicyMember  member_type ;
+  typedef Kokkos::Impl::QthreadsTeamPolicyMember  member_type ;
 
 private:
 
@@ -650,7 +650,7 @@ public:
 
   static member_type & member_single();
 
-  friend void wait( TaskPolicy< Kokkos::Qthread > & );
+  friend void wait( TaskPolicy< Kokkos::Qthreads > & );
 };
 
 } /* namespace Experimental */
@@ -660,5 +660,5 @@ public:
 //----------------------------------------------------------------------------
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
+#endif /* #define KOKKOS_QTHREADS_TASKSCHEDULER_HPP */
 
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55235cd6d27a9df0e40bd28dff8caa13df94073e
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
@@ -0,0 +1,319 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *
+ *  Task execution is handled here directly for the Qthreads implementation.
+ */
+template<>
+class TaskQueue< Kokkos::Qthreads > {
+private:
+
+  using execution_space = Kokkos::Qthreads ;
+  using memory_space    = Kokkos::HostSpace ;
+  using device_type     = Kokkos::Device< execution_space, memory_space > ;
+  using memory_pool     = Kokkos::Experimental::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+  // Alias assumed here so that iff_single_thread_recursive_execute(), execute(),
+  // and proc_set_apply() below can refer to the Qthreads queue specialization.
+  using specialization  = Kokkos::Impl::TaskQueueSpecialization< execution_space > ;
+
+  friend class Kokkos::TaskScheduler< execution_space > ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { TASK_STATE_NULL         =  0,  ///<  Does not exist
+               TASK_STATE_CONSTRUCTING =  1,  ///<  Is under construction
+               TASK_STATE_WAITING      =  2,  ///<  Is waiting for execution
+               TASK_STATE_EXECUTING    =  4,  ///<  Is executing
+               TASK_STATE_RESPAWN      =  8,  ///<  Requested respawn
+               TASK_STATE_COMPLETE     = 16   ///<  Execution is complete
+             };
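+
+  // Sketch of the intended lifecycle, as used by schedule(), reschedule(), and
+  // complete() below ( not an exhaustive state machine ):
+  //   CONSTRUCTING -> WAITING -> EXECUTING -> COMPLETE,
+  //   with EXECUTING -> RESPAWN -> WAITING when a task requests a respawn.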
+
+  // Queue is organized as [ priority ][ type ]
+
+  memory_pool  m_memory ;
+  unsigned     m_team_size ;   // Number of threads in a team
+  long         m_accum_alloc ; // Accumulated number of allocations
+  int          m_count_alloc ; // Current number of allocations
+  int          m_max_alloc ;   // Maximum number of allocations
+  int          m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue
+    ( const memory_space & arg_space,
+      unsigned const arg_memory_pool_capacity,
+      unsigned const arg_memory_pool_superblock_capacity_log2
+    );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION
+  void schedule( task_root_type * const );
+
+  // Reschedule a task
+  //   Precondition:
+  //     task is in Executing state
+  //     task->m_next == LockTag
+  //   Postcondition:
+  //     task is in Executing-Respawn state
+  //     task->m_next == 0 (no dependence)
+  KOKKOS_FUNCTION
+  void reschedule( task_root_type * );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      specialization::iff_single_thread_recursive_execute( this );
+#endif
+    }
+
+  void execute() { specialization::execute( this ); }
+
+  template< typename FunctorType >
+  void proc_set_apply( typename task_root_type::function_type * ptr )
+    {
+      specialization::template proc_set_apply< FunctorType >( ptr );
+    }
+
+  // Assign task pointer with reference counting of assigned tasks
+  template< typename LV, typename RV >
+  KOKKOS_FUNCTION static
+  void assign( TaskBase< execution_space, LV, void > ** const lhs,
+               TaskBase< execution_space, RV, void > *  const rhs )
+    {
+      using task_lhs = TaskBase< execution_space, LV, void > ;
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
+            uintptr_t( lhs ? *lhs : 0 ),
+            uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
+            uintptr_t(rhs),
+            uintptr_t( rhs ? rhs->m_next : 0 ),
+            int( rhs ? rhs->m_task_type : 0 ),
+            int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+
+      if ( *lhs )
+      {
+        const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
+
+        if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
+          // Reference count is zero and task is complete, deallocate.
+          (*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
+        }
+        else if ( count <= 1 ) {
+          Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
+        }
+
+        // GEM: Should I check that there are no dependences here?  Can the state
+        //      be set to complete while there are still dependences?
+      }
+
+      if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
+
+      // Force write of *lhs
+
+      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
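+
+  // Example of the effect of the code above ( no additional behavior implied ):
+  // assign( & dst, nullptr ) drops dst's reference, deallocates the task if that
+  // was the last reference to a completed task, and then stores nullptr into dst.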
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskBase< Kokkos::Qthreads, void, void >
+{
+public:
+
+  enum : int16_t   { TaskTeam   = TaskBase< void, void, void >::TaskTeam,
+                     TaskSingle = TaskBase< void, void, void >::TaskSingle,
+                     Aggregate  = TaskBase< void, void, void >::Aggregate };
+
+  enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
+                     EndTag  = TaskBase< void, void, void >::EndTag };
+
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef void (* function_type) ( TaskBase *, void * );
+
+  // sizeof(TaskBase) == 48
+
+  function_type  m_apply ;       ///< Apply function pointer
+  queue_type   * m_queue ;       ///< Queue in which this task resides
+  TaskBase     * m_dep ;         ///< Dependence
+  int32_t        m_ref_count ;   ///< Reference count
+  int32_t        m_alloc_size ;  ///< Allocation size
+  int32_t        m_dep_count ;   ///< Aggregate's number of dependences
+  int16_t        m_task_type ;   ///< Type of task
+  int16_t        m_priority ;    ///< Priority of runnable task
+  aligned_t      m_qfeb ;        ///< Qthread full/empty bit
+  int            m_state ;       ///< State of the task
+
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr TaskBase() noexcept
+    : m_apply(0),
+      m_queue(0),
+      m_dep(0),
+      m_ref_count(0),
+      m_alloc_size(0),
+      m_dep_count(0),
+      m_task_type( TaskSingle ),
+      m_priority( 1 /* TaskRegularPriority */ ),
+      m_qfeb(0),
+      m_state( queue_type::TASK_STATE_CONSTRUCTING )
+    {
+      qthread_empty( & m_qfeb ); // Set to full when complete
+    }
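+
+  // Note ( behavior inferred from the older TaskPolicy code in this change ):
+  // m_qfeb starts empty here; qthread_fill( & m_qfeb ) at completion releases any
+  // task that listed this FEB among its spawn preconditions.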
+
+  //----------------------------------------
+
+  static aligned_t qthread_func( void * arg );
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase ** aggregate_dependences()
+    { return reinterpret_cast<TaskBase**>( this + 1 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+    { return m_state == queue_type::TASK_STATE_RESPAWN; }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskBase* dep )
+    {
+      // Assign dependence to m_dep.  It will be processed in the subsequent
+      // call to schedule.  Error if the dependence is reset.
+      if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
+        Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
+      }
+
+      if ( 0 != dep ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
+      }
+    }
+
+  using get_return_type = void ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a9190c731c6034724b63094c55967de78caab64
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
@@ -0,0 +1,436 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{
+  m_queue->~TaskQueue();
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::TaskQueue
+  ( const TaskQueue< ExecSpace >::memory_space & arg_space,
+    unsigned const arg_memory_pool_capacity,
+    unsigned const arg_memory_pool_superblock_capacity_log2 )
+  : m_memory( arg_space,
+              arg_memory_pool_capacity,
+              arg_memory_pool_superblock_capacity_log2 ),
+    m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),
+    m_accum_alloc(0),
+    m_count_alloc(0),
+    m_max_alloc(0),
+    m_ready_count(0)
+{}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // Verify that ready count is zero.
+  if ( 0 != m_ready_count ) {
+    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{
+  return m_memory.allocate_block_size( n );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  void * const p = m_memory.allocate(n);
+
+  if ( p ) {
+    Kokkos::atomic_increment( & m_accum_alloc );
+    Kokkos::atomic_increment( & m_count_alloc );
+
+    if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
+  }
+
+  return p ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
+{
+  m_memory.deallocate( p, n );
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+#if 0
+  printf( "schedule( 0x%lx { %d %d %d }\n",
+          uintptr_t(task),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count );
+#endif
+
+  // The task has been constructed and is waiting to be executed.
+  task->m_state = TASK_STATE_WAITING ;
+
+  if ( task->m_task_type != task_root_type::Aggregate ) {
+    // Scheduling a single or team task.
+
+    // Increment active task count before spawning.
+    Kokkos::atomic_increment( & m_ready_count );
+
+    if ( task->m_dep == 0 ) {
+      // Schedule a task with no dependences.
+
+      if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
+        // If there is more than one shepherd, spawn on a shepherd other than this one.
+        const int num_shepherd  = qthread_num_shepherds();
+        const int this_shepherd = qthread_shep();
+        int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+        fprintf( stdout,
+                 "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+                 qthread_shep(),
+                 qthread_worker_local(NULL),
+                 reinterpret_cast<unsigned long>(this),
+                 spawn_shepherd,
+                 m_team_size - 1
+               );
+        fflush(stdout);
+#endif
+
+        qthread_spawn_cloneable(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          spawn_shepherd,
+          unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+          m_team_size - 1
+        );
+      }
+      else {
+        qthread_spawn(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          NO_SHEPHERD,
+          QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+        );
+      }
+    }
+    else if ( task->m_dep->m_task_type != task_root_type::Aggregate ) {
+    // Malloc the precondition array to pass to qthread_spawn().  For
+    // non-aggregate tasks, it is a single pointer since there are no
+    // dependences.  Qthreads will eventually free this allocation so memory will
+    // not be leaked. Is malloc thread-safe?  Should this call be guarded?  The
+    // memory can't be allocated from the pool allocator because Qthreads frees
+    // it using free().
+    aligned_t ** qprecon = (aligned_t **) malloc( sizeof(aligned_t *) );
+
+    *qprecon = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
+
+    if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
+      // If there is more than one shepherd, spawn on a shepherd other than this one.
+      const int num_shepherd  = qthread_num_shepherds();
+      const int this_shepherd = qthread_shep();
+      int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+  fprintf( stdout,
+           "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+           qthread_shep(),
+           qthread_worker_local(NULL),
+           reinterpret_cast<unsigned long>(this),
+           spawn_shepherd,
+           m_team_size - 1
+         );
+  fflush(stdout);
+#endif
+
+      qthread_spawn_cloneable(
+        & Task::qthread_func,
+        this,
+        0,
+        NULL,
+        m_dep_size,
+        qprecon, /* dependences */
+        spawn_shepherd,
+        unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+        m_team_size - 1
+      );
+    }
+    else {
+      qthread_spawn(
+        & Task::qthread_func, /* function */
+        this,                 /* function argument */
+        0,
+        NULL,
+        m_dep_size,
+        qprecon, /* dependences */
+        NO_SHEPHERD,
+        QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+      );
+    }
+    }
+  }
+  else {
+    // GEM: How do I handle an aggregate (when_all) task?
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
+{
+  // Precondition:
+  //   task is in Executing state
+  //   task->m_next == LockTag
+  //
+  // Postcondition:
+  //   task is in Executing-Respawn state
+  //   task->m_next == 0 (no dependence)
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+    Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
+          uintptr_t(task),
+          uintptr_t(task->m_wait),
+          uintptr_t(task->m_next),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count
+        );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and requested a respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // Either an aggregate or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task, zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+
+        // Set x->m_next = zero  <=  no dependence
+
+        task_root_type * const next =
+          (task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // A runnable task was popped from a ready queue and executed.
+    // If respawned into a ready queue then the ready count was incremented
+    // so decrement whether respawned or not.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template<>
+aligned_t
+TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = Kokkos::Impl::QthreadsTeamPolicyMember;
+
+  task_root_type * const task = reinterpret_cast< task_root_type * >( arg );
+
+  // The first member of the team changes the state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state,
+                                          queue_type::TASK_STATE_WAITING,
+                                          queue_type::TASK_STATE_EXECUTING
+                                        );
+
+  if ( task_root_type::TaskTeam == task->m_task_type )
+  {
+    if ( 1 < task->m_queue->m_team_size ) {
+      // Team task with team size of more than 1.
+      Member::TaskTeam task_team_tag ;
+
+      // Initialize team size and rank with shepherd info.
+      Member member( task_team_tag );
+
+      (*task->m_apply)( task , & member );
+
+#if 0
+      fprintf( stdout,
+              "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
+              qthread_shep(),
+              qthread_worker_local(NULL),
+              reinterpret_cast<unsigned long>(task),
+              member.team_rank(),
+              member.team_size()
+            );
+      fflush(stdout);
+#endif
+
+      member.team_barrier();
+      if ( member.team_rank() == 0 ) task->closeout();
+      member.team_barrier();
+    }
+    else {
+      // Team task with team size of 1.
+      Member member ;
+      (*task->m_apply)( task , & member );
+      task->closeout();
+    }
+  }
+  else {
+    (*task->m_apply)( task );
+    task->closeout();
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthreads/README
similarity index 99%
rename from lib/kokkos/core/src/Qthread/README
rename to lib/kokkos/core/src/Qthreads/README
index 6e6c86a9efc2680916e2556bda28914833e6749d..e35b1f698ec7ca3e3ee020eeee4445de43023c78 100644
--- a/lib/kokkos/core/src/Qthread/README
+++ b/lib/kokkos/core/src/Qthreads/README
@@ -22,4 +22,3 @@ sh autogen.sh
 # install
 
 make install
-
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 0f69be9ed4db6547d52e1c96b735069fb2332081..b1f53489f432ba093ea2222b16c88ee68e005374 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -264,7 +264,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
   const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
 
   for ( int i = 0 ; i < n ; ++i ) {
-    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+    Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
   }
 
   exec.m_pool_state = ThreadsExec::Inactive ;
@@ -308,7 +308,7 @@ void ThreadsExec::fence()
 {
   if ( s_thread_pool_size[0] ) {
     // Wait for the root thread to complete:
-    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+    Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
   }
 
   s_current_function     = 0 ;
@@ -724,7 +724,7 @@ void ThreadsExec::initialize( unsigned thread_count ,
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -777,7 +777,7 @@ void ThreadsExec::finalize()
   s_threads_process.m_pool_fan_size   = 0 ;
   s_threads_process.m_pool_state = ThreadsExec::Inactive ;
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 385dd492d0e8cc9417b50dd817538abf4f27246c..a6db02ebac84b96a736519a22a537bdc53ea6b1a 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -187,13 +187,13 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the reduction and broadcast
@@ -229,13 +229,13 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the reduction and broadcast
@@ -264,7 +264,7 @@ public:
 
         ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
 
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
 
         Join::join( f , reduce_memory() , fan.reduce_memory() );
       }
@@ -280,7 +280,7 @@ public:
       const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
 
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
-        Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
       }
     }
 
@@ -312,7 +312,7 @@ public:
         ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
 
         // Wait: Active -> ReductionAvailable (or ScanAvailable)
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
         Join::join( f , work_value , fan.reduce_memory() );
       }
 
@@ -330,8 +330,8 @@ public:
 
           // Wait: Active             -> ReductionAvailable
           // Wait: ReductionAvailable -> ScanAvailable
-          Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
-          Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
+          Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
 
           Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
         }
@@ -342,7 +342,7 @@ public:
 
         // Wait for all threads to complete inclusive scan
         // Wait: ScanAvailable -> Rendezvous
-        Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
       }
 
       //--------------------------------
@@ -350,7 +350,7 @@ public:
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
         // Wait: ReductionAvailable -> ScanAvailable
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
         // Set: ScanAvailable -> Rendezvous
         fan.m_pool_state = ThreadsExec::Rendezvous ;
       }
@@ -377,13 +377,13 @@ public:
       // Wait for all threads to copy previous thread's inclusive scan value
       // Wait for all threads: Rendezvous -> ScanCompleted
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
       }
       if ( rev_rank ) {
         // Set: ScanAvailable -> ScanCompleted
         m_pool_state = ThreadsExec::ScanCompleted ;
         // Wait: ScanCompleted -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
       }
       // Set: ScanCompleted -> Active
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@@ -410,7 +410,7 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@@ -418,7 +418,7 @@ public:
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the thread-scan before releasing threads
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index b9edb64551f21d96f35a5276b06b501101b4e3e7..701495428193148f0efaf8dbf1cdededabd66460 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -49,6 +49,7 @@
 #include <utility>
 #include <impl/Kokkos_spinwait.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
 
@@ -103,13 +104,13 @@ public:
 
       // Wait for fan-in threads
       for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
       }
 
       // If not root then wait for release
       if ( m_team_rank_rev ) {
         m_exec->state() = ThreadsExec::Rendezvous ;
-        Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
       }
 
       return ! m_team_rank_rev ;
@@ -350,6 +351,10 @@ public:
         const int team_rank_rev = pool_rank_rev % team.team_alloc();
         const size_t pool_league_size     = m_exec->pool_size() / team.team_alloc() ;
         const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+        if(pool_league_rank_rev >= pool_league_size) {
+          m_invalid_thread = 1;
+          return;
+        }
         const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
 
         const int pool_num_teams       = m_exec->pool_size()/team.team_alloc();
@@ -505,7 +510,8 @@ private:
            , const int team_size_request )
    {
       const int pool_size  = traits::execution_space::thread_pool_size(0);
-      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size<max_host_team_size?pool_size:max_host_team_size;
       const int team_grain = traits::execution_space::thread_pool_size(2);
 
       m_league_size = league_size_request ;
@@ -552,8 +558,12 @@ public:
 
   template< class FunctorType >
   inline static
-  int team_size_max( const FunctorType & )
-    { return traits::execution_space::thread_pool_size(1); }
+  int team_size_max( const FunctorType & ) {
+      int pool_size = traits::execution_space::thread_pool_size(1);
+      int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      return pool_size<max_host_team_size?pool_size:max_host_team_size;
+    }
+
 
   template< class FunctorType >
   static int team_size_recommended( const FunctorType & )
@@ -819,9 +829,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
 #pragma ivdep
 #endif
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
+    lambda(i,result);
   }
 }
 
@@ -835,18 +843,14 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
 void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
 
-  ValueType result = init_result;
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
+    lambda(i,result);
   }
-  init_result = result;
 }
 
 /** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4db3e15ef4593422eca54ab5d295f5469d3a5ad
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -0,0 +1,2356 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
+#define KOKKOS_MDRANGE_IVDEP
+#endif
+
+
+#ifdef KOKKOS_MDRANGE_IVDEP
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
+#else
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE
+#endif
+
+
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+// Temporary, for testing new loop macros
+#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
+
+
+#define LOOP_1L(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for( type i0=0; i0<static_cast<type>(tile[0]); ++i0)
+
+#define LOOP_2L(type, tile) \
+  for( type i1=0; i1<static_cast<type>(tile[1]); ++i1) \
+  LOOP_1L(type, tile)
+
+#define LOOP_3L(type, tile) \
+  for( type i2=0; i2<static_cast<type>(tile[2]); ++i2) \
+  LOOP_2L(type, tile)
+
+#define LOOP_4L(type, tile) \
+  for( type i3=0; i3<static_cast<type>(tile[3]); ++i3) \
+  LOOP_3L(type, tile)
+
+#define LOOP_5L(type, tile) \
+  for( type i4=0; i4<static_cast<type>(tile[4]); ++i4) \
+  LOOP_4L(type, tile)
+
+#define LOOP_6L(type, tile) \
+  for( type i5=0; i5<static_cast<type>(tile[5]); ++i5) \
+  LOOP_5L(type, tile)
+
+#define LOOP_7L(type, tile) \
+  for( type i6=0; i6<static_cast<type>(tile[6]); ++i6) \
+  LOOP_6L(type, tile)
+
+#define LOOP_8L(type, tile) \
+  for( type i7=0; i7<static_cast<type>(tile[7]); ++i7) \
+  LOOP_7L(type, tile)
+
+
+#define LOOP_1R(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for ( type i0=0; i0<static_cast<type>(tile[0]); ++i0 )
+
+#define LOOP_2R(type, tile) \
+  LOOP_1R(type, tile) \
+  for ( type i1=0; i1<static_cast<type>(tile[1]); ++i1 )
+
+#define LOOP_3R(type, tile) \
+  LOOP_2R(type, tile) \
+  for ( type i2=0; i2<static_cast<type>(tile[2]); ++i2 )
+
+#define LOOP_4R(type, tile) \
+  LOOP_3R(type, tile) \
+  for ( type i3=0; i3<static_cast<type>(tile[3]); ++i3 )
+
+#define LOOP_5R(type, tile) \
+  LOOP_4R(type, tile) \
+  for ( type i4=0; i4<static_cast<type>(tile[4]); ++i4 )
+
+#define LOOP_6R(type, tile) \
+  LOOP_5R(type, tile) \
+  for ( type i5=0; i5<static_cast<type>(tile[5]); ++i5 )
+
+#define LOOP_7R(type, tile) \
+  LOOP_6R(type, tile) \
+  for ( type i6=0; i6<static_cast<type>(tile[6]); ++i6 )
+
+#define LOOP_8R(type, tile) \
+  LOOP_7R(type, tile) \
+  for ( type i7=0; i7<static_cast<type>(tile[7]); ++i7 )
+
+
+#define LOOP_ARGS_1 i0 + m_offset[0]
+#define LOOP_ARGS_2 LOOP_ARGS_1, i1 + m_offset[1]
+#define LOOP_ARGS_3 LOOP_ARGS_2, i2 + m_offset[2]
+#define LOOP_ARGS_4 LOOP_ARGS_3, i3 + m_offset[3]
+#define LOOP_ARGS_5 LOOP_ARGS_4, i4 + m_offset[4]
+#define LOOP_ARGS_6 LOOP_ARGS_5, i5 + m_offset[5]
+#define LOOP_ARGS_7 LOOP_ARGS_6, i6 + m_offset[6]
+#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
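+
+// Illustrative only (not part of the interface): the LOOP_*L / LOOP_*R macros are
+// meant to be followed by a body that consumes LOOP_ARGS_*.  A hypothetical rank-2
+// LayoutLeft body -- where index_type, m_tiledims, m_offset, and apply() stand in
+// for members of the enclosing iterate-tile functor -- would read roughly:
+//
+//   LOOP_2L(index_type, m_tiledims) {
+//     apply( LOOP_ARGS_2 );  // i.e. apply( i0 + m_offset[0], i1 + m_offset[1] )
+//   }
+//
+// so i0, the stride-1 dimension for LayoutLeft, is the innermost loop index.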
+
+
+
+// New Loop Macros...
+// parallel_for, non-tagged
+#define APPLY( func, ... ) \
+  func( __VA_ARGS__ );
+
+// LayoutRight
+// d = 0 to start
+#define LOOP_R_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1( func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2( func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3( func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4( func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5( func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6( func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7( func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+#define LOOP_L_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1( func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2( func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3( func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4( func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5( func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6( func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7( func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+// TODO: rank not necessary to pass through, can hardcode the values
+#define LOOP_LAYOUT_1( func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY( func, i0 + m_offset[0] )              \
+  } 
+
+#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1( func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2( func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3( func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4( func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5( func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6( func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7( func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_partial, rank ) }
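+
+// Illustrative expansion (comment only; names are placeholders): for rank == 2,
+// the LayoutRight branch of TILE_LOOP_2 -- i.e. is_left == false, with either the
+// full- or partial-tile extents selected by cond -- unrolls to roughly
+//
+//   for( type i1 = 0; i1 < static_cast<type>(extent[0]); ++i1 )
+//     for( type i0 = 0; i0 < static_cast<type>(extent[1]); ++i0 )
+//       func( i1 + m_offset[0], i0 + m_offset[1] );
+//
+// so the last (stride-1 for LayoutRight) dimension is traversed by the innermost,
+// IVDEP-annotated loop.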
+
+
+// parallel_reduce, non-tagged
+// Reduction version
+#define APPLY_REDUX( val, func, ... ) \
+  func( __VA_ARGS__, val );
+
+// LayoutRight
+// d = 0 to start
+#define LOOP_R_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+#define LOOP_L_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1_REDUX( val, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2_REDUX( val, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3_REDUX( val, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4_REDUX( val, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5_REDUX( val, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6_REDUX( val, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7_REDUX( val, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+#define LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[0] )              \
+  } 
+
+#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1_REDUX( val, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2_REDUX( val, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3_REDUX( val, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4_REDUX( val, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5_REDUX( val, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6_REDUX( val, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7_REDUX( val, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
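+
+// The *_REDUX variants above differ only in that APPLY_REDUX appends the reduction
+// value, so the innermost call in the rank-2 LayoutRight sketch above becomes
+// func( i1 + m_offset[0], i0 + m_offset[1], val ).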
+// end New Loop Macros
+
+
+// tagged macros
+#define TAGGED_APPLY( tag, func, ... ) \
+  func( tag, __VA_ARGS__ );
+
+// LayoutRight
+// d = 0 to start
+#define TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+#define TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+// TODO: rank not necessary to pass through, can hardcode the values
+#define TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[0] )              \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+#define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_2( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_3( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_4( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_5( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_6( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_7( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_8( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+
+// parallel_reduce, tagged
+// Reduction version
+#define TAGGED_APPLY_REDUX( val, tag, func, ... ) \
+  func( tag, __VA_ARGS__, val );
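+// For example, TAGGED_APPLY_REDUX( sum, WorkTag(), f, i, j ) expands to
+// f( WorkTag(), i, j, sum ); i.e. the reduction value is appended as the last functor argument.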
+
+// LayoutRight
+// d = 0 to start
+#define TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+// LayoutLeft
+// d = rank-1 to start
+#define TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+#define TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] )              \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+#define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_2_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_3_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_4_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_5_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_6_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_7_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TAGGED_TILE_LOOP_8_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent_partial, rank ) }
+
+// end tagged macros
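+
+// Illustrative sketch (not generated code): for rank 2 with LayoutRight
+// (is_left == false) and a full tile, TAGGED_TILE_LOOP_2 expands roughly to the
+// nested loops below; tag, func, offset and tile stand in for the actual arguments.
+//
+//   if (cond) {
+//     for( index_type i1 = 0; i1 < tile[0]; ++i1) {
+//       for( index_type i0 = 0; i0 < tile[1]; ++i0) {
+//         func( tag, i1 + offset[0], i0 + offset[1] );
+//       }
+//     }
+//   }
+//   else { /* same loops over the partial-tile extents */ }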
+
+
+
+
+// Structs for calling loops
+template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
+struct Tile_Loop_Type;
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<1, IsLeft, IType, void, void >
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_1( func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_1_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<2, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_2( func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_2_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<3, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_3( func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_3_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<4, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_4( func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_4_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<5, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_5( func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_5_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<6, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_6( func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_6_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<7, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_7( func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_7_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+};
+
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<8, IsLeft, IType, void, void>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_8( func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TILE_LOOP_8_REDUX( value, func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+};
+
+// tagged versions
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<1, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type >
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_1( Tagged(), func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_1_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 1 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<2, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_2( Tagged(), func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_2_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 2 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<3, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_3( Tagged(), func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_3_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 3 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<4, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_4( Tagged(), func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_4_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 4 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<5, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_5( Tagged(), func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_5_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 5 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<6, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_6( Tagged(), func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_6_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 6 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<7, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_7( Tagged(), func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_7_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 7 );
+  }
+};
+
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  template < typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_8( Tagged(), func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+
+  template < typename ValType, typename Func, typename Offset, typename ExtentA, typename ExtentB >
+  static void apply(ValType &value, Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b)
+  {
+    TAGGED_TILE_LOOP_8_REDUX( value, Tagged(), func, IType, IsLeft, cond, offset, a, b, 8 );
+  }
+};
+// end Structs for calling loops
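+
+// Illustrative sketch (not part of the implementation): a host backend hands a
+// single tile to these structs as shown below; functor, value, full_tile, offset,
+// tile and tiledims are placeholder names for the values computed per tile.
+//
+//   // parallel_for flavor, rank 3, LayoutRight, untagged:
+//   Tile_Loop_Type< 3, false, int, void >::apply( functor, full_tile, offset, tile, tiledims );
+//
+//   // parallel_reduce flavor with a WorkTag; Tagged() is prepended to each functor call:
+//   Tile_Loop_Type< 3, false, int, WorkTag >::apply( value, functor, full_tile, offset, tile, tiledims );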
+
+
+template <typename T>
+using is_void = std::is_same< T , void >;
+
+template < typename RP
+         , typename Functor
+         , typename Tag = void
+         , typename ValueType = void
+         , typename Enable = void
+         >
+struct HostIterateTile;
+
+// For ParallelFor
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  inline
+  HostIterateTile( RP const& rp, Functor const& func )
+    : m_rp(rp)
+    , m_func(func)
+  {
+  }
+
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+      for ( int i = 0; i < RP::rank; ++i ) {
+        if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+            partial_tile[i] = m_rp.m_tile[i] ;
+        }
+        else {
+          is_full_tile = false ;
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i]) 
+                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
+        }
+      }
+
+    return is_full_tile ;
+  } // end check bounds
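+
+  // Example (illustrative only): in a dimension with m_lower = 0, m_upper = 10 and
+  // m_tile = 4, the tile starting at offset 8 does not fit (8 + 4 > 10), so
+  // is_full_tile becomes false and partial_tile is reduced to 10 - 8 = 2.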
+
+
+  template <int Rank>
+  struct RankTag 
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { 
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else 
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // Added due to a compiler error when using SFINAE to choose the operator based on rank with Cuda+Serial builds.
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 8
+#endif
+
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args...);
+    }
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args...);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+//  value_type  & m_v;
+
+};
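+
+// Illustrative sketch (not part of the implementation): a host parallel_for over an
+// MDRangePolicy can drive this functor by invoking it once per tile index; policy,
+// functor and num_tiles below are placeholder names.
+//
+//   HostIterateTile< MDRangePolicy, Functor, typename MDRangePolicy::work_tag, void >
+//     iterate( policy, functor );
+//   for ( typename MDRangePolicy::index_type t = 0; t < num_tiles; ++t ) {
+//     iterate( t );  // decodes t into a tile offset and runs the nested loops
+//   }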
+
+
+// ValueType: For reductions
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  inline
+  HostIterateTile( RP const& rp, Functor const& func, value_type & v )
+    : m_rp(rp) //Cuda 7.0 does not like braces...
+    , m_func(func)
+    , m_v(v) // use with non-void ValueType struct
+  {
+// Errors occur when using braces rather than parentheses for initialization (with Cuda 7.0), e.g.:
+//      /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: error: too many braces around initializer for ‘int’ [-fpermissive]
+//      /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: error: aggregate value used where an integer was expected
+  }
+
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+      for ( int i = 0; i < RP::rank; ++i ) {
+        if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+            partial_tile[i] = m_rp.m_tile[i] ;
+        }
+        else {
+          is_full_tile = false ;
+            partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 
+                            : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i]) 
+                            : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
+        }
+      }
+
+    return is_full_tile ;
+  } // end check bounds
+
+
+  template <int Rank>
+  struct RankTag 
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { 
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else 
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // Added due to a compiler error when using SFINAE to choose the operator based on rank.
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_3R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_3 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_4R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_4 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_5R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_5 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_6R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_6 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_7R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_7 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_8R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_8 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 8
+#endif
+
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args... , m_v);
+    }
+
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args... , m_v);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  value_type  & m_v;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
+
+// ------------------------------------------------------------------ //
+
+// MDFunctor - wraps the range policy and functor for passing to HostIterateTile.
+// Used by the Serial, Threads, and OpenMP backends;
+// Cuda uses DeviceIterateTile directly within md_parallel_for.
+// ParallelReduce version:
+template < typename MDRange, typename Functor, typename ValueType = void >
+struct MDFunctor
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
+                                                                           , Functor
+                                                                           , work_tag
+                                                                           , value_type
+                                                                           >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+//  KOKKOS_FORCEINLINE_FUNCTION // commented out: caused a Cuda __host__ warning
+  inline
+  void operator()(index_type t, value_type & v) const
+  {
+    iterate_type(m_range, m_func, v)(t);
+  }
+
+  MDRange   m_range;
+  Functor   m_func;
+};
+
+// ParallelFor
+template < typename MDRange, typename Functor >
+struct MDFunctor< MDRange, Functor, void >
+{
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
+                                                                           , Functor
+                                                                           , work_tag
+                                                                           , void
+                                                                           >;
+
+
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  inline
+  MDFunctor( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor const& ) = default;
+
+  inline
+  MDFunctor( MDFunctor && ) = default;
+
+  inline
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  inline
+  void operator()(index_type t) const
+  {
+    iterate_type(m_range, m_func)(t);
+  }
+
+  MDRange m_range;
+  Functor m_func;
+};
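+
+// Usage sketch (illustrative only; host_md_for_sketch is a hypothetical helper
+// and m_num_tiles is assumed to expose the policy's total tile count):
+//
+//   template< typename MDRange , typename Functor >
+//   void host_md_for_sketch( MDRange const & range , Functor const & functor )
+//   {
+//     MDFunctor< MDRange , Functor , void > wrap( range , functor );
+//     for ( typename MDRange::index_type t = 0 ; t < range.m_num_tiles ; ++t ) {
+//       wrap( t ); // HostIterateTile expands t into that tile's index block
+//     }
+//   }
+//
+// The real host backends dispatch the same operator()(index_type) through
+// their existing RangePolicy machinery inside md_parallel_for.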
+
+#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
index 0ffbc0548ab663c9b6afa8799f162e3c7bbd7510..7d7fd3d1334901f1cc57e554f6c46f7f17ca09c4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
 {
 #if defined( __CUDA_ARCH__ )
   return __ffs(i) - 1;
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_ffs(i) - 1;
-#elif defined( __INTEL_COMPILER )
+#elif defined( KOKKOS_COMPILER_INTEL )
   return _bit_scan_forward(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
 #else
-
   unsigned t = 1u;
   int r = 0;
   while ( i && ( i & t == 0 ) )
@@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
   enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
 #if defined( __CUDA_ARCH__ )
   return shift - __clz(i);
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return _bit_scan_reverse(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return shift - __cntlz4(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
-#elif defined( __INTEL_COMPILER )
-  return _bit_scan_reverse(i);
 #else
   unsigned t = 1u << shift;
   int r = 0;
@@ -101,10 +104,12 @@ int bit_count( unsigned i )
 {
 #if defined( __CUDA_ARCH__ )
   return __popc(i);
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_popcount(i);
 #elif defined ( __INTEL_COMPILER )
   return _popcnt32(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __popcnt4(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
 #else
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
   i = i - ( ( i >> 1 ) & ~0u / 3u );                             // temp
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
index cd38eaa9da867a31a9274684f235456b30590d92..7c38430c44986d5dcffad9c03c9f587ffdc91863 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
   }
 #endif
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
 #endif
 }
@@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
 void finalize_internal( const bool all_spaces = false )
 {
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
 #endif
 
@@ -449,5 +449,323 @@ void fence()
   Impl::fence_internal();
 }
 
+void print_configuration( std::ostream & out , const bool detail )
+{
+  std::ostringstream msg;
+
+  msg << "Compiler:" << std::endl;
+#ifdef KOKKOS_COMPILER_APPLECC
+  msg << "  KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CLANG
+  msg << "  KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CRAYC
+  msg << "  KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_GNU
+  msg << "  KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_IBM
+  msg << "  KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_INTEL
+  msg << "  KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_NVCC
+  msg << "  KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_PGI
+  msg << "  KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
+#endif
+
+
+  msg << "Architecture:" << std::endl;
+#ifdef KOKKOS_ENABLE_ISA_KNC
+  msg << "  KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_X86_64
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
+#endif
+
+
+  msg << "Devices:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA: ";
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PTHREAD: ";
+#ifdef KOKKOS_ENABLE_PTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_STDTHREAD: ";
+#ifdef KOKKOS_ENABLE_STDTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_WINTHREAD: ";
+#ifdef KOKKOS_ENABLE_WINTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Default Device:" << std::endl;
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Atomics:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_GNU_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_GNU_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Vectorization:" << std::endl;
+  msg << "  KOKKOS_ENABLE_PRAGMA_IVDEP: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_SIMD: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_UNROLL: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_VECTOR: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+  msg << "Memory:" << std::endl;
+  msg << "  KOKKOS_ENABLE_HBWSPACE: ";
+#ifdef KOKKOS_ENABLE_HBWSPACE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
+#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_POSIX_MEMALIGN: ";
+#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_ASM: ";
+#ifdef KOKKOS_ENABLE_ASM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX1Z: ";
+#ifdef KOKKOS_ENABLE_CXX1Z
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
+#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_HWLOC: ";
+#ifdef KOKKOS_ENABLE_HWLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_LIBRT: ";
+#ifdef KOKKOS_ENABLE_LIBRT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_MPI: ";
+#ifdef KOKKOS_ENABLE_MPI
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PROFILING: ";
+#ifdef KOKKOS_ENABLE_PROFILING
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "Cuda Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
+#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_UVM: ";
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
+#ifdef KOKKOS_ENABLE_CUSPARSE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#endif
+
+  msg << "\nRuntime Configuration:" << std::endl;
+#ifdef KOKKOS_ENABLE_CUDA
+  Cuda::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+  OpenMP::print_configuration(msg, detail);
+#endif
+#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
+  Threads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_QTHREADS
+  Qthreads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_SERIAL
+  Serial::print_configuration(msg, detail);
+#endif
+
+  out << msg.str() << std::endl;
+}
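+
+// Usage sketch (illustrative only):
+//
+//   Kokkos::initialize( argc , argv );
+//   Kokkos::print_configuration( std::cout , true /* detail */ );
+//   // ... application ...
+//   Kokkos::finalize();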
+
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b425b3f19fa159925364d20ac6d5bc85b45bebae
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -0,0 +1,653 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORANALYSIS_HPP
+#define KOKKOS_FUNCTORANALYSIS_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_Reducer.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct FunctorPatternInterface {
+  struct FOR {};
+  struct REDUCE {};
+  struct SCAN {};
+};
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If 'value_type' is not explicitly declared in the functor
+ *  then attempt to deduce the type from FunctorType::operator()
+ *  interface used by the pattern and policy.
+ *
+ *  For the REDUCE pattern generate a Reducer and finalization function
+ *  derived from what is available within the functor.
+ */
+template< typename PatternInterface , class Policy , class Functor >
+struct FunctorAnalysis {
+private:
+
+  using FOR    = FunctorPatternInterface::FOR ;
+  using REDUCE = FunctorPatternInterface::REDUCE ;
+  using SCAN   = FunctorPatternInterface::SCAN ;
+
+  //----------------------------------------
+
+  struct VOID {};
+
+  template< typename P = Policy , typename = std::false_type >
+  struct has_work_tag
+    {
+      using type = void ;
+      using wtag = VOID ;
+    };
+
+  template< typename P >
+  struct has_work_tag
+    < P , typename std::is_same< typename P::work_tag , void >::type >
+    {
+      using type = typename P::work_tag ;
+      using wtag = typename P::work_tag ;
+    };
+
+  using Tag  = typename has_work_tag<>::type ;
+  using WTag = typename has_work_tag<>::wtag ;
+
+  //----------------------------------------
+  // Check for Functor::value_type, which is either a simple type T or T[]
+
+  template< typename F , typename = std::false_type >
+  struct has_value_type { using type = void ; };
+
+  template< typename F >
+  struct has_value_type
+    < F , typename std::is_same< typename F::value_type , void >::type >
+  {
+    using type = typename F::value_type ;
+
+    static_assert( ! std::is_reference< type >::value &&
+                   std::rank< type >::value <= 1 &&
+                   std::extent< type >::value == 0
+                 , "Kokkos Functor::value_type is T or T[]" );
+  };
+
+  //----------------------------------------
+  // If Functor::value_type does not exist then evaluate operator(),
+  // depending upon the pattern and whether the policy has a work tag,
+  // to determine the reduction or scan value_type.
+
+  template< typename F
+          , typename P = PatternInterface
+          , typename V = typename has_value_type<F>::type
+          , bool     T = std::is_same< Tag , void >::value
+          >
+  struct deduce_value_type { using type = V ; };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , true > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , false > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , true > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , false > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & , I ) const );
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  //----------------------------------------
+
+  using candidate_type = typename deduce_value_type< Functor >::type ;
+
+  enum { candidate_is_void  = std::is_same< candidate_type , void >::value
+       , candidate_is_array = std::rank< candidate_type >::value == 1 };
+
+  //----------------------------------------
+
+public:
+
+  using value_type = typename std::remove_extent< candidate_type >::type ;
+
+  static_assert( ! std::is_const< value_type >::value
+               , "Kokkos functor operator reduce argument cannot be const" );
+
+private:
+
+  // Stub to avoid defining a type 'void &'
+  using ValueType = typename
+    std::conditional< candidate_is_void , VOID , value_type >::type ;
+
+public:
+
+  using pointer_type = typename
+    std::conditional< candidate_is_void , void , ValueType * >::type ;
+
+  using reference_type = typename
+    std::conditional< candidate_is_array  , ValueType * , typename
+    std::conditional< ! candidate_is_void , ValueType & , void >
+    ::type >::type ;
+
+private:
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< IsArray , unsigned >::type
+  get_length( FF const & f ) { return f.value_count ; }
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< ! IsArray , unsigned >::type
+  get_length( FF const & ) { return 1 ; }
+
+public:
+
+  enum { StaticValueSize = ! candidate_is_void &&
+                           ! candidate_is_array
+                         ? sizeof(ValueType) : 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
+
+  //----------------------------------------
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const Unknown & )
+    { return 1 ; }
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const Unknown & )
+    { return sizeof(ValueType); }
+
+private:
+
+  enum INTERFACE : int
+    { DISABLE           = 0
+    , NO_TAG_NOT_ARRAY  = 1
+    , NO_TAG_IS_ARRAY   = 2
+    , HAS_TAG_NOT_ARRAY = 3
+    , HAS_TAG_IS_ARRAY  = 4
+    , DEDUCED =
+       ! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
+       std::is_same<Tag,void>::value
+         ? (candidate_is_array ? NO_TAG_IS_ARRAY  : NO_TAG_NOT_ARRAY)
+         : (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
+    };
+
+  //----------------------------------------
+  // parallel_reduce join operator
+
+  template< class F , INTERFACE >
+  struct has_join_function ;
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_NOT_ARRAY >
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_IS_ARRAY >
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( dst , src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( WTag() , *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_IS_ARRAY >
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( WTag() , dst , src ); }
+    };
+
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceJoin
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+       {
+         const int n = FunctorAnalysis::value_count( f );
+         for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
+       }
+    };
+
+  template< class F >
+  struct DeduceJoin< F , DISABLE , void >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const &
+               , ValueType volatile *
+               , ValueType volatile const * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceJoin< F , I ,
+    decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
+    : public has_join_function<F,I> {};
+
+  //----------------------------------------
+
+  template< class , INTERFACE >
+  struct has_init_function ;
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( WTag(), *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceInit
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
+    };
+
+  template< class F >
+  struct DeduceInit< F , DISABLE , void >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceInit< F , I ,
+    decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
+    : public has_init_function<F,I> {};
+
+  //----------------------------------------
+
+public:
+
+  struct Reducer
+  {
+  private:
+
+    Functor     const & m_functor ;
+    ValueType * const   m_result ;
+    int         const   m_length ;
+
+  public:
+
+    using reducer        = Reducer ;
+    using value_type     = FunctorAnalysis::value_type ;
+    using memory_space   = void ;
+    using reference_type = FunctorAnalysis::reference_type ;
+
+    KOKKOS_INLINE_FUNCTION
+    void join( ValueType volatile * dst
+             , ValueType volatile const * src ) const noexcept
+      { DeduceJoin<>::join( m_functor , dst , src ); }
+
+    KOKKOS_INLINE_FUNCTION
+    void init( ValueType * dst ) const noexcept
+      { DeduceInit<>::init( m_functor , dst ); }
+
+    KOKKOS_INLINE_FUNCTION explicit
+    constexpr Reducer( Functor const & arg_functor
+                     , ValueType     * arg_value = 0
+                     , int             arg_length = 0 ) noexcept
+      : m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
+
+    KOKKOS_INLINE_FUNCTION
+    constexpr int length() const noexcept { return m_length ; }
+
+    KOKKOS_INLINE_FUNCTION
+    ValueType & operator[]( int i ) const noexcept
+      { return m_result[i]; }
+
+  private:
+
+    template< bool IsArray >
+    constexpr
+    typename std::enable_if< IsArray , ValueType * >::type
+    ref() const noexcept { return m_result ; }
+
+    template< bool IsArray >
+    constexpr
+    typename std::enable_if< ! IsArray , ValueType & >::type
+    ref() const noexcept { return *m_result ; }
+
+  public:
+
+    KOKKOS_INLINE_FUNCTION
+    auto result() const noexcept
+      -> decltype( Reducer::template ref< candidate_is_array >() )
+      { return Reducer::template ref< candidate_is_array >(); }
+ };
+
+  //----------------------------------------
+
+private:
+
+  template< class , INTERFACE >
+  struct has_final_function ;
+
+  // No tag, not array
+  template< class F >
+  struct has_final_function< F , NO_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( *dst ); }
+    };
+
+  // No tag, is array
+  template< class F >
+  struct has_final_function< F , NO_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( dst ); }
+    };
+
+  // Has tag, not array
+  template< class F >
+  struct has_final_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( WTag(), *dst ); }
+    };
+
+  // Has tag, is array
+  template< class F >
+  struct has_final_function< F , HAS_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceFinal
+    {
+      KOKKOS_INLINE_FUNCTION
+      static void final( F const & , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceFinal< F , I ,
+    decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
+    : public has_final_function<F,I> {};
+
+public:
+
+  static void final( Functor const & f , ValueType * result )
+    { DeduceFinal<>::final( f , result ); }
+
+};
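+
+// Illustrative example (not part of this header): for a reduction functor
+// that does not declare a value_type, the type is deduced from operator(),
+// e.g.
+//
+//   struct SumSketch {  // hypothetical functor
+//     KOKKOS_INLINE_FUNCTION
+//     void operator()( int i , double & update ) const { update += i ; }
+//   };
+//
+//   using analysis = FunctorAnalysis< FunctorPatternInterface::REDUCE
+//                                   , Kokkos::RangePolicy< Kokkos::Serial >
+//                                   , SumSketch > ;
+//
+//   static_assert( std::is_same< analysis::value_type , double >::value , "" );
+//
+// analysis::Reducer forwards to the functor's join/init/final members when
+// they are detected, and otherwise falls back to operator+= accumulation and
+// default construction.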
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index 96d30d0c4acac8af49f6b2c25ef2bb1c04508a28..eb1f5ce96c28fa05d70dd2bf840133688d82b247 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -62,7 +62,7 @@
 #include <memkind.h>
 #endif
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 
@@ -198,7 +198,7 @@ void * HBWSpace::allocate( const size_t arg_alloc_size ) const
     case STD_MALLOC: msg << "STD_MALLOC" ; break ;
     }
     msg << " ]( " << arg_alloc_size << " ) FAILED" ;
-    if ( ptr == NULL ) { msg << " NULL" ; } 
+    if ( ptr == NULL ) { msg << " NULL" ; }
     else { msg << " NOT ALIGNED " << ptr ; }
 
     std::cerr << msg.str() << std::endl ;
@@ -218,7 +218,7 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
     if ( m_alloc_mech == STD_MALLOC ) {
       void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
       memkind_free(MEMKIND_TYPE, alloc_ptr );
-    }    
+    }
 
   }
 }
@@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
       )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -297,7 +297,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
 
 void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
 allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
-                , const std::string & arg_alloc_label 
+                , const std::string & arg_alloc_label
                 , const size_t arg_alloc_size )
 {
   if ( ! arg_alloc_size ) return (void *) 0 ;
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index 3cd603728e52f1b851219a01f91eb0d5358e4c86..67be86c9a3ed8595a35915f06a4b8e4ea5ded0b3 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,14 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <algorithm>
 #include <Kokkos_Macros.hpp>
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 /*--------------------------------------------------------------------------*/
@@ -292,7 +292,7 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
     case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
     }
     msg << " ]( " << arg_alloc_size << " ) FAILED" ;
-    if ( ptr == NULL ) { msg << " NULL" ; } 
+    if ( ptr == NULL ) { msg << " NULL" ; }
     else { msg << " NOT ALIGNED " << ptr ; }
 
     std::cerr << msg.str() << std::endl ;
@@ -312,7 +312,7 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
     if ( m_alloc_mech == STD_MALLOC ) {
       void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
       free( alloc_ptr );
-    }    
+    }
 
 #if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
     else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
@@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::HostSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
       )
   , m_space( arg_space )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
    }
@@ -406,7 +406,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
 
 void * SharedAllocationRecord< Kokkos::HostSpace , void >::
 allocate_tracked( const Kokkos::HostSpace & arg_space
-                , const std::string & arg_alloc_label 
+                , const std::string & arg_alloc_label
                 , const size_t arg_alloc_size )
 {
   if ( ! arg_alloc_size ) return (void *) 0 ;
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac200209c72bca381f60b9564944bc444748f0fb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -0,0 +1,463 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <limits>
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void HostThreadTeamData::organize_pool
+  ( HostThreadTeamData * members[] , const int size )
+{
+  bool ok = true ;
+
+  // Verify not already a member of a pool:
+  for ( int rank = 0 ; rank < size && ok ; ++rank ) {
+    ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
+  }
+
+  if ( ok ) {
+
+    int64_t * const root_scratch = members[0]->m_scratch ;
+
+    for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
+      root_scratch[i] = 0 ;
+    }
+
+    {
+      HostThreadTeamData ** const pool =
+        (HostThreadTeamData **) (root_scratch + m_pool_members);
+
+      // team size == 1, league size == pool_size
+
+      for ( int rank = 0 ; rank < size ; ++rank ) {
+        HostThreadTeamData * const mem = members[ rank ] ;
+        mem->m_pool_scratch = root_scratch ;
+        mem->m_team_scratch = mem->m_scratch ;
+        mem->m_pool_rank    = rank ;
+        mem->m_pool_size    = size ;
+        mem->m_team_base    = rank ;
+        mem->m_team_rank    = 0 ;
+        mem->m_team_size    = 1 ;
+        mem->m_team_alloc   = 1 ;
+        mem->m_league_rank  = rank ;
+        mem->m_league_size  = size ;
+        mem->m_pool_rendezvous_step = 0 ;
+        mem->m_team_rendezvous_step = 0 ;
+        pool[ rank ] = mem ;
+      }
+    }
+
+    Kokkos::memory_fence();
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
+  }
+}
+
+void HostThreadTeamData::disband_pool()
+{
+   m_work_range.first  = -1 ;
+   m_work_range.second = -1 ;
+   m_pool_scratch = 0 ;
+   m_team_scratch = 0 ;
+   m_pool_rank    = 0 ;
+   m_pool_size    = 1 ;
+   m_team_base    = 0 ;
+   m_team_rank    = 0 ;
+   m_team_size    = 1 ;
+   m_team_alloc   = 1 ;
+   m_league_rank  = 0 ;
+   m_league_size  = 1 ;
+   m_pool_rendezvous_step = 0 ;
+   m_team_rendezvous_step = 0 ;
+}
+
+int HostThreadTeamData::organize_team( const int team_size )
+{
+  // Pool is initialized
+  const bool ok_pool = 0 != m_pool_scratch ;
+
+  // Team is not set
+  const bool ok_team =
+    m_team_scratch == m_scratch &&
+    m_team_base    == m_pool_rank &&
+    m_team_rank    == 0 &&
+    m_team_size    == 1 &&
+    m_team_alloc   == 1 &&
+    m_league_rank  == m_pool_rank &&
+    m_league_size  == m_pool_size ;
+
+  if ( ok_pool && ok_team ) {
+
+    if ( team_size <= 0 ) return 0 ; // No teams to organize
+
+    if ( team_size == 1 ) return 1 ; // Already organized in teams of one
+
+    HostThreadTeamData * const * const pool =
+      (HostThreadTeamData **) (m_pool_scratch + m_pool_members);
+
+    // "league_size" in this context is the number of concurrent teams
+    // that the pool can accommodate.  Excess threads are idle.
+    const int league_size     = m_pool_size / team_size ;
+    const int team_alloc_size = m_pool_size / league_size ;
+    const int team_alloc_rank = m_pool_rank % team_alloc_size ;
+    const int league_rank     = m_pool_rank / team_alloc_size ;
+    const int team_base_rank  = league_rank * team_alloc_size ;
+
+    m_team_scratch = pool[ team_base_rank ]->m_scratch ;
+    m_team_base    = team_base_rank ;
+    // This needs to check overflow: if m_pool_size % team_alloc_size != 0
+    // there are two corner cases:
+    // (i)  if team_alloc_size == team_size there might be a non-full
+    //      zombie team around (for example m_pool_size = 5 and team_size = 2);
+    // (ii) if team_alloc > team_size then the last team might have fewer
+    //      threads than the others.
+    m_team_rank    = ( team_base_rank + team_size <= m_pool_size ) &&
+                     ( team_alloc_rank < team_size ) ?
+                     team_alloc_rank : -1;
+    m_team_size    = team_size ;
+    m_team_alloc   = team_alloc_size ;
+    m_league_rank  = league_rank ;
+    m_league_size  = league_size ;
+    m_team_rendezvous_step = 0 ;
+
+    if ( team_base_rank == m_pool_rank ) {
+      // Initialize team's rendezvous memory
+      for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
+        m_scratch[i] = 0 ;
+      }
+      // Make sure the team's rendezvous memory initialization
+      // is written out before proceeding.
+      Kokkos::memory_fence();
+    }
+
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to ensure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
+
+    if ( pool_rendezvous() ) {
+      pool_rendezvous_release();
+    }
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
+  }
+
+  return 0 <= m_team_rank ;
+}
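+
+// Worked example (illustrative): with m_pool_size == 10 and team_size == 4,
+//   league_size     = 10 / 4 = 2
+//   team_alloc_size = 10 / 2 = 5
+// so pool ranks 0-4 form league rank 0 and ranks 5-9 form league rank 1;
+// within each allocation the ranks with team_alloc_rank 0-3 get team ranks
+// 0-3 and the fifth rank gets m_team_rank == -1 (idle for this configuration).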
+
+void HostThreadTeamData::disband_team()
+{
+  m_team_scratch = m_scratch ;
+  m_team_base    = m_pool_rank ;
+  m_team_rank    = 0 ;
+  m_team_size    = 1 ;
+  m_team_alloc   = 1 ;
+  m_league_rank  = m_pool_rank ;
+  m_league_size  = m_pool_size ;
+  m_team_rendezvous_step = 0 ;
+}
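+
+// Lifecycle sketch (illustrative; assumes each pool thread already owns a
+// HostThreadTeamData instance with its scratch assigned):
+//
+//   enum { pool_size = 16 , team_size = 4 };           // example sizes
+//   HostThreadTeamData * members[ pool_size ];         // one entry per pool thread
+//   // ... point each entry at that thread's HostThreadTeamData ...
+//   members[0]->organize_pool( members , pool_size );  // called once
+//
+//   // then, on every pool thread of rank 'rank':
+//   if ( members[rank]->organize_team( team_size ) ) {
+//     // ... team-parallel work for active team members ...
+//   }
+//   members[rank]->disband_team();
+//   members[rank]->disband_pool();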
+
+//----------------------------------------------------------------------------
+/* pattern for rendezvous
+ *
+ *  if ( rendezvous() ) {
+ *     ... all other threads are still in team_rendezvous() ...
+ *     rendezvous_release();
+ *     ... all other threads are released from team_rendezvous() ...
+ *  }
+ */
+
+int HostThreadTeamData::rendezvous( int64_t * const buffer
+                                  , int & rendezvous_step
+                                  , int const size
+                                  , int const rank ) noexcept
+{
+  enum : int { shift_byte = 3 };
+  enum : int { size_byte  = ( 01 << shift_byte ) }; // == 8
+  enum : int { mask_byte  = size_byte - 1 };
+
+  enum : int { shift_mem_cycle = 2 };
+  enum : int { size_mem_cycle  = ( 01 << shift_mem_cycle ) }; // == 4
+  enum : int { mask_mem_cycle  = size_mem_cycle - 1 };
+
+  // Cycle step values: 1 <= step <= size_val_cycle
+  // An odd multiple of memory cycle so that when a memory location
+  // is reused it has a different value.
+  // Must be representable within a single byte: size_val_cycle < 16
+
+  enum : int { size_val_cycle = 3 * size_mem_cycle };
+
+  // Requires:
+  //   Called by rank = [ 0 .. size )
+  //   buffer aligned to int64_t[4]
+
+  // A sequence of rendezvous operations uses four cycled locations in memory
+  // and non-equal cycled synchronization values to
+  // 1) prevent one rendezvous from overtaking another and
+  // 2) give each spin-wait location an int64_t[4] span
+  //    so that it has its own cache line.
+
+  const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
+
+  rendezvous_step = step ;
+
+  // The leading int64_t[4] span is for thread 0 to write
+  // and for all other threads to spin-wait reading.
+  // sync_offset is the index into this array for this step.
+
+  const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
+
+  union {
+    int64_t full ;
+    int8_t  byte[8] ;
+  } value ;
+
+  if ( rank ) {
+
+    const int group_begin = rank << shift_byte ; // == rank * size_byte
+
+    if ( group_begin < size ) {
+
+      //  This thread waits for threads
+      //   [ group_begin .. group_begin + 8 )
+      //   [ rank*8      .. rank*8 + 8      )
+      // to write to their designated bytes.
+
+      const int end = group_begin + size_byte < size
+                    ? size_byte : size - group_begin ;
+
+      value.full = 0 ;
+      for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
+
+      store_fence(); // This should not be needed but fixes #742
+
+      spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
+                          , value.full );
+    }
+
+    {
+      // This thread sets its designated byte.
+      //   ( rank % size_byte ) +
+      //   ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
+      //   ( sync_offset * size_byte )
+      const int offset = ( rank & mask_byte )
+                       + ( ( rank & ~mask_byte ) << shift_mem_cycle )
+                       + ( sync_offset << shift_byte );
+
+      // All of this thread's previous memory stores must be complete before
+      // this thread stores the step value at this thread's designated byte
+      // in the shared synchronization array.
+
+      Kokkos::memory_fence();
+
+      ((volatile int8_t*) buffer)[ offset ] = int8_t( step );
+
+      // Memory fence to push the previous store out
+      Kokkos::memory_fence();
+    }
+
+    // Wait for thread 0 to release all other threads
+
+    spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
+
+  }
+  else {
+    // Thread 0 waits for threads [1..7]
+    // to write to their designated bytes.
+
+    const int end = size_byte < size ? 8 : size ;
+
+    value.full = 0 ;
+    for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
+
+    spinwait_until_equal( buffer[ sync_offset ], value.full );
+  }
+
+  return rank ? 0 : 1 ;
+}
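+
+// Walk-through (illustrative, size == 20 threads): ranks 1 and 2 first
+// spin-wait for their byte groups (ranks 8-15 and 16-19 respectively) to
+// write 'step' into the group's int64_t word; every nonzero rank then stores
+// 'step' into its own designated byte; rank 0 spin-waits only on bytes 1-7.
+// Rank 0 therefore returns 1 once the whole tree of byte writes is complete,
+// while every other rank keeps spinning until rendezvous_release() stores
+// 'step' into this cycle's release word.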
+
+void HostThreadTeamData::
+  rendezvous_release( int64_t * const buffer
+                    , int const rendezvous_step ) noexcept
+{
+  enum : int { shift_mem_cycle = 2 };
+  enum : int { size_mem_cycle  = ( 01 << shift_mem_cycle ) }; // == 4
+  enum : int { mask_mem_cycle  = size_mem_cycle - 1 };
+
+  // Requires:
+  //   Called after team_rendezvous
+  //   Called only by true == team_rendezvous(root)
+
+  // Memory fence to be sure all previous writes are complete:
+  Kokkos::memory_fence();
+
+  ((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
+     int64_t( rendezvous_step );
+
+  // Memory fence to push the store out
+  Kokkos::memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+int HostThreadTeamData::get_work_stealing() noexcept
+{
+  pair_int_t w( -1 , -1 );
+
+  if ( 1 == m_team_size || team_rendezvous() ) {
+
+    // Attempt first from beginning of my work range
+    for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
+
+      // Query and attempt to update m_work_range
+      //   from: [ w.first     , w.second )
+      //   to:   [ w.first + 1 , w.second ) = w_new
+      //
+      // If w is invalid then this is just a query.
+
+      const pair_int_t w_new( w.first + 1 , w.second );
+
+      w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
+
+      if ( w.first < w.second ) {
+        // m_work_range is viable
+
+        // If steal is successful then don't repeat attempt to steal
+        attempt = ! ( w_new.first  == w.first + 1 &&
+                      w_new.second == w.second );
+      }
+      else {
+        // m_work_range is not viable
+        w.first  = -1 ;
+        w.second = -1 ;
+
+        attempt = 0 ;
+      }
+    }
+
+    if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
+
+      HostThreadTeamData * const * const pool =
+        (HostThreadTeamData**)( m_pool_scratch + m_pool_members );
+
+      // Attempt from the beginning failed, try to steal from the end of a neighbor.
+
+      pair_int_t volatile * steal_range =
+        & ( pool[ m_steal_rank ]->m_work_range );
+
+      for ( int attempt = true ; attempt ; ) {
+
+        // Query and attempt to update steal_work_range
+        //   from: [ w.first , w.second )
+        //   to:   [ w.first , w.second - 1 ) = w_new
+        //
+        // If w is invalid then this is just a query.
+
+        const pair_int_t w_new( w.first , w.second - 1 );
+
+        w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
+
+        if ( w.first < w.second ) {
+          // steal_work_range is viable
+
+          // If steal is successful then don't repeat attempt to steal
+          attempt = ! ( w_new.first  == w.first &&
+                        w_new.second == w.second - 1 );
+        }
+        else {
+          // steal_work_range is not viable, move to next member
+          w.first  = -1 ;
+          w.second = -1 ;
+
+          // We need to figure out whether the next team is active:
+          // m_steal_rank + m_team_alloc could be the next base rank to steal from,
+          // but only if another m_team_size threads are available so that
+          // that base rank has a full team.
+          m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
+                         m_steal_rank + m_team_alloc : 0;
+
+          steal_range = & ( pool[ m_steal_rank ]->m_work_range );
+
+          // If tried all other members then don't repeat attempt to steal
+          attempt = m_steal_rank != m_pool_rank ;
+        }
+      }
+
+      if ( w.first != -1 ) w.first = w.second - 1 ;
+    }
+
+    if ( 1 < m_team_size ) {
+      // Must share the work index
+      *((int volatile *) team_reduce()) = w.first ;
+
+      team_rendezvous_release();
+    }
+  }
+  else if ( 1 < m_team_size ) {
+    w.first = *((int volatile *) team_reduce());
+  }
+
+  // May exit because successfully stole work and w is good.
+  // May exit because no work left to steal and w = (-1,-1).
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
+       , m_pool_rank , m_pool_size , w.first );
+fflush(stdout);
+#endif
+
+  return w.first ;
+}
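+
+// Summary (illustrative): each team owns a [first,second) work range.  The
+// owning team pops indices from the front via atomic_compare_exchange
+// ( [f,s) -> [f+1,s) ), while thieves shrink a victim's range from the back
+// ( [f,s) -> [f,s-1) ) and take index s-1; the compare-exchange guarantees
+// each index is handed out exactly once even when owner and thieves contend.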
+
+} // namespace Impl
+} // namespace Kokkos
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b5918eaefc2ee74e951b8caabdeb0d4e8c488c0
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -0,0 +1,1090 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Reducer.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class HostExecSpace >
+class HostThreadTeamMember ;
+
+class HostThreadTeamData {
+public:
+
+  template< class > friend class HostThreadTeamMember ;
+
+  // Assume upper bounds on number of threads:
+  //   pool size       <= 1024 threads
+  //   pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 516
+  //   team size       <= 64 threads
+  //   team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36
+
+  enum : int { max_pool_members  = 1024 };
+  enum : int { max_team_members  = 64 };
+  enum : int { max_pool_rendezvous  = ( max_pool_members / 8 ) * 4 + 4 };
+  enum : int { max_team_rendezvous  = ( max_team_members / 8 ) * 4 + 4 };
+
+private:
+
+  // per-thread scratch memory buffer chunks:
+  //
+  //   [ pool_members ]     = [ m_pool_members    .. m_pool_rendezvous )
+  //   [ pool_rendezvous ]  = [ m_pool_rendezvous .. m_team_rendezvous )
+  //   [ team_rendezvous ]  = [ m_team_rendezvous .. m_pool_reduce )
+  //   [ pool_reduce ]      = [ m_pool_reduce     .. m_team_reduce )
+  //   [ team_reduce ]      = [ m_team_reduce     .. m_team_shared )
+  //   [ team_shared ]      = [ m_team_shared     .. m_thread_local )
+  //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
+
+  enum : int { m_pool_members    = 0 };
+  enum : int { m_pool_rendezvous = m_pool_members    + max_pool_members };
+  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
+  enum : int { m_pool_reduce     = m_team_rendezvous + max_team_rendezvous };
+
+  using pair_int_t = Kokkos::pair<int,int> ;
+
+  pair_int_t  m_work_range ;
+  int64_t     m_work_end ;
+  int64_t   * m_scratch ;       // per-thread buffer
+  int64_t   * m_pool_scratch ;  // == pool[0]->m_scratch
+  int64_t   * m_team_scratch ;  // == pool[ 0 + m_team_base ]->m_scratch
+  int         m_pool_rank ;
+  int         m_pool_size ;
+  int         m_team_reduce ;
+  int         m_team_shared ;
+  int         m_thread_local ;
+  int         m_scratch_size ;
+  int         m_team_base ;
+  int         m_team_rank ;
+  int         m_team_size ;
+  int         m_team_alloc ;
+  int         m_league_rank ;
+  int         m_league_size ;
+  int         m_work_chunk ;
+  int         m_steal_rank ; // work stealing rank
+  int mutable m_pool_rendezvous_step ;
+  int mutable m_team_rendezvous_step ;
+
+  HostThreadTeamData * team_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[m_team_base+r]; }
+
+  // Rendezvous pattern:
+  //   if ( rendezvous(root) ) {
+  //     ... only root thread here while all others wait ...
+  //     rendezvous_release();
+  //   }
+  //   else {
+  //     ... all other threads release here ...
+  //   }
+  //
+  // Requires: buffer[ ( max_threads / 8 ) * 4 + 4 ]; 0 == max_threads % 8
+  //
+  static
+  int rendezvous( int64_t * const buffer
+                , int & rendezvous_step
+                , int const size
+                , int const rank ) noexcept ;
+
+  static
+  void rendezvous_release( int64_t * const buffer
+                         , int const rendezvous_step ) noexcept ;
+
+public:
+
+  inline
+  int team_rendezvous( int const root ) const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , ( m_team_rank + m_team_size - root ) % m_team_size );
+    }
+
+  inline
+  int team_rendezvous() const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , m_team_rank );
+    }
+
+  inline
+  void team_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_team_size ) {
+        rendezvous_release( m_team_scratch + m_team_rendezvous
+                          , m_team_rendezvous_step );
+      }
+    }
+
+  inline
+  int pool_rendezvous() const noexcept
+    {
+      return 1 == m_pool_size ? 1 :
+             rendezvous( m_pool_scratch + m_pool_rendezvous
+                       , m_pool_rendezvous_step
+                       , m_pool_size
+                       , m_pool_rank );
+    }
+
+  inline
+  void pool_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_pool_size ) {
+        rendezvous_release( m_pool_scratch + m_pool_rendezvous
+                          , m_pool_rendezvous_step );
+      }
+    }
+
+  //----------------------------------------
+
+  constexpr HostThreadTeamData() noexcept
+    : m_work_range(-1,-1)
+    , m_work_end(0)
+    , m_scratch(0)
+    , m_pool_scratch(0)
+    , m_team_scratch(0)
+    , m_pool_rank(0)
+    , m_pool_size(1)
+    , m_team_reduce(0)
+    , m_team_shared(0)
+    , m_thread_local(0)
+    , m_scratch_size(0)
+    , m_team_base(0)
+    , m_team_rank(0)
+    , m_team_size(1)
+    , m_team_alloc(1)
+    , m_league_rank(0)
+    , m_league_size(1)
+    , m_work_chunk(0)
+    , m_steal_rank(0)
+    , m_pool_rendezvous_step(0)
+    , m_team_rendezvous_step(0)
+    {}
+
+  //----------------------------------------
+  // Organize array of members into a pool.
+  // The 0th member is the root of the pool.
+  // Requires: members are not already in a pool.
+  // Requires: called by one thread.
+  // Pool members are ordered as "close": sorted by NUMA and then CORE.
+  // Each thread is its own team with team_size == 1.
+  static void organize_pool( HostThreadTeamData * members[]
+                           , const int size );
+
+  // Called by each thread within the pool
+  void disband_pool();
+
+  //----------------------------------------
+  // Each thread within a pool organizes itself into a team.
+  // Must be called by all threads of the pool.
+  // Organizing threads into a team performs a barrier across the
+  // entire pool to ensure proper initialization of the team
+  // rendezvous mechanism before a team rendezvous can be performed.
+  //
+  // Return true  if a valid member of a team.
+  // Return false if not a member and thread should be idled.
+  int organize_team( const int team_size );
+
+  // Each thread within a pool disbands itself from current team.
+  // Each thread becomes its own team with team_size == 1.
+  // Must be called by all threads of the pool.
+  void disband_team();
+
+  //----------------------------------------
+
+  constexpr int pool_rank() const { return m_pool_rank ; }
+  constexpr int pool_size() const { return m_pool_size ; }
+
+  HostThreadTeamData * pool_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[r]; }
+
+  //----------------------------------------
+
+private:
+
+  enum : int { mask_to_16 = 0x0f }; // align to 16 bytes
+  enum : int { shift_to_8 = 3 };    // size to 8 bytes
+
+public:
+
+  static constexpr int align_to_int64( int n )
+    { return ( ( n + mask_to_16 ) & ~mask_to_16 ) >> shift_to_8 ; }
+
+  constexpr int pool_reduce_bytes() const
+    { return m_scratch_size ? sizeof(int64_t) * ( m_team_reduce - m_pool_reduce ) : 0 ; }
+
+  constexpr int team_reduce_bytes() const
+    { return sizeof(int64_t) * ( m_team_shared - m_team_reduce ); }
+
+  constexpr int team_shared_bytes() const
+    { return sizeof(int64_t) * ( m_thread_local - m_team_shared ); }
+
+  constexpr int thread_local_bytes() const
+    { return sizeof(int64_t) * ( m_scratch_size - m_thread_local ); }
+
+  constexpr int scratch_bytes() const
+    { return sizeof(int64_t) * m_scratch_size ; }
+
+  // Memory chunks:
+
+  int64_t * scratch_buffer() const noexcept
+    { return m_scratch ; }
+
+  int64_t * pool_reduce() const noexcept
+    { return m_pool_scratch + m_pool_reduce ; }
+
+  int64_t * pool_reduce_local() const noexcept
+    { return m_scratch + m_pool_reduce ; }
+
+  int64_t * team_reduce() const noexcept
+    { return m_team_scratch + m_team_reduce ; }
+
+  int64_t * team_reduce_local() const noexcept
+    { return m_scratch + m_team_reduce ; }
+
+  int64_t * team_shared() const noexcept
+    { return m_team_scratch + m_team_shared ; }
+
+  int64_t * local_scratch() const noexcept
+    { return m_scratch + m_thread_local ; }
+
+  // Given:
+  //   pool_reduce_size  = number of bytes for pool reduce
+  //   team_reduce_size  = number of bytes for team reduce
+  //   team_shared_size  = number of bytes for team shared memory
+  //   thread_local_size = number of bytes for thread local memory
+  // Return:
+  //   total number of bytes that must be allocated
+  static
+  size_t scratch_size( int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int thread_local_size )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      thread_local_size = align_to_int64( thread_local_size );
+
+      const size_t total_bytes = (
+        m_pool_reduce +
+        pool_reduce_size +
+        team_reduce_size +
+        team_shared_size +
+        thread_local_size ) * sizeof(int64_t);
+
+      return total_bytes ;
+    }
+
+  // Given:
+  //   alloc_ptr         = pointer to allocated memory
+  //   alloc_size        = number bytes of allocated memory
+  //   pool_reduce_size  = number of bytes for pool reduce/scan operations
+  //   team_reduce_size  = number of bytes for team reduce/scan operations
+  //   team_shared_size  = number of bytes for team-shared memory
+  //   thread_local_size = number of bytes for thread-local memory
+  // Effect:
+  //   assigns the scratch chunk offsets within the given allocation
+  void scratch_assign( void * const alloc_ptr
+                     , size_t const alloc_size
+                     , int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int /* thread_local_size */ )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      // thread_local_size = align_to_int64( thread_local_size );
+
+      m_scratch      = (int64_t *) alloc_ptr ;
+      m_team_reduce  = m_pool_reduce + pool_reduce_size ;
+      m_team_shared  = m_team_reduce + team_reduce_size ;
+      m_thread_local = m_team_shared + team_shared_size ;
+      m_scratch_size = align_to_int64( alloc_size );
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
+       , int(m_pool_members)
+       , int(m_pool_rendezvous)
+       , int(m_pool_reduce)
+       , int(m_team_reduce)
+       , int(m_team_shared)
+       , int(m_thread_local)
+       , int(m_scratch_size)
+       );
+fflush(stdout);
+#endif
+
+    }
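+
+  // A minimal allocation/assignment sketch (illustrative only; 'data' and the
+  // byte counts pr/tr/ts/tl are placeholder names).  It mirrors the pattern
+  // used by the Serial backend later in this patch:
+  //
+  //   const size_t nbytes = HostThreadTeamData::scratch_size( pr, tr, ts, tl );
+  //   void * const ptr    = Kokkos::HostSpace().allocate( nbytes );
+  //   data.scratch_assign( ptr, nbytes, pr, tr, ts, tl );
+  //   HostThreadTeamData * pool[1] = { & data };
+  //   HostThreadTeamData::organize_pool( pool, 1 );
+  //   data.organize_team( 1 );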
+
+  //----------------------------------------
+  // Get a work index within the range.
+  // First try to steal from the beginning of the team's own partition.
+  // If that fails then try to steal from the end of another team's partition.
+  int get_work_stealing() noexcept ;
+
+  //----------------------------------------
+  // Set the initial work partitioning of [ 0 .. length ) among the teams
+  // with granularity of chunk
+
+  void set_work_partition( int64_t const length
+                         , int     const chunk ) noexcept
+    {
+      // Minimum chunk size to ensure that
+      //   m_work_end < std::numeric_limits<int>::max() * m_work_chunk
+
+      int const chunk_min = ( length + std::numeric_limits<int>::max() )
+                            / std::numeric_limits<int>::max();
+
+      m_work_end   = length ;
+      m_work_chunk = std::max( chunk , chunk_min );
+
+      // Number of work chunks and partitioning of that number:
+      int const num  = ( m_work_end + m_work_chunk - 1 ) / m_work_chunk ;
+      int const part = ( num + m_league_size - 1 ) / m_league_size ;
+
+      m_work_range.first  = part * m_league_rank ;
+      m_work_range.second = m_work_range.first + part ;
+
+      // Steal from next team, round robin
+      // The next team is offset by m_team_alloc if it fits in the pool.
+
+      m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? 
+                     m_team_base + m_team_alloc : 0 ;
+    }
+
+  std::pair<int64_t,int64_t> get_work_partition() noexcept
+    {
+      return std::pair<int64_t,int64_t>
+        ( m_work_range.first * m_work_chunk
+        , m_work_range.second * m_work_chunk < m_work_end
+        ? m_work_range.second * m_work_chunk : m_work_end );
+    }
+
+  std::pair<int64_t,int64_t> get_work_stealing_chunk() noexcept
+    {
+      std::pair<int64_t,int64_t> x(-1,-1);
+
+      const int i = get_work_stealing();
+
+      if ( 0 <= i ) {
+        x.first  = m_work_chunk * i ;
+        x.second = x.first + m_work_chunk < m_work_end
+                 ? x.first + m_work_chunk : m_work_end ;
+      }
+
+      return x ;
+    }
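+
+  // A minimal consumption-loop sketch for the work-stealing schedule
+  // (illustrative only; 'data' and 'functor' are placeholder names).
+  // get_work_stealing_chunk() returns (-1,-1) once all work is exhausted:
+  //
+  //   data.set_work_partition( length , chunk );
+  //   // ... pool-wide barrier so every member sees its partition ...
+  //   for ( std::pair<int64_t,int64_t> r = data.get_work_stealing_chunk() ;
+  //         0 <= r.first ; r = data.get_work_stealing_chunk() ) {
+  //     for ( int64_t i = r.first ; i < r.second ; ++i ) functor( i );
+  //   }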
+};
+
+//----------------------------------------------------------------------------
+
+template< class HostExecSpace >
+class HostThreadTeamMember {
+public:
+
+  using scratch_memory_space = typename HostExecSpace::scratch_memory_space ;
+
+private:
+
+  scratch_memory_space m_scratch ;
+  HostThreadTeamData & m_data ;
+  int const            m_league_rank ;
+  int const            m_league_size ;
+
+public:
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data ) noexcept
+    : m_scratch( arg_data.team_shared() , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank(0)
+    , m_league_size(1)
+    {}
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data
+                                , int const            arg_league_rank
+                                , int const            arg_league_size
+                                ) noexcept
+    : m_scratch( arg_data.team_shared()
+               , arg_data.team_shared_bytes()
+               , arg_data.team_shared()
+               , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank( arg_league_rank )
+    , m_league_size( arg_league_size )
+    {}
+
+  ~HostThreadTeamMember() = default ;
+  HostThreadTeamMember() = delete ;
+  HostThreadTeamMember( HostThreadTeamMember && ) = default ;
+  HostThreadTeamMember( HostThreadTeamMember const & ) = default ;
+  HostThreadTeamMember & operator = ( HostThreadTeamMember && ) = default ;
+  HostThreadTeamMember & operator = ( HostThreadTeamMember const & ) = default ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const noexcept { return m_data.m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const noexcept { return m_data.m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_rank() const noexcept { return m_league_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_size() const noexcept { return m_league_size ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_scratch(int) const
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & thread_scratch(int) const
+    { return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }
+
+  //----------------------------------------
+  // Team collectives
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( m_data.team_rendezvous() ) m_data.team_rendezvous_release();
+    }
+#else
+    {}
+#endif
+
+  template< class Closure >
+  KOKKOS_INLINE_FUNCTION
+  void team_barrier( Closure const & f ) const noexcept
+    {
+      if ( m_data.team_rendezvous() ) {
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+
+        f();
+
+        m_data.team_rendezvous_release();
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( T & value , const int source_team_rank ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+        T volatile * const shared_value = (T*) m_data.team_reduce();
+
+        // Don't overwrite shared memory until all threads arrive
+
+        if ( m_data.team_rendezvous( source_team_rank ) ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+
+          *shared_value = value ;
+
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          value = *shared_value ;
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_broadcast\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< class Closure , typename T >
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast( Closure const & f , T & value , const int source_team_rank) const noexcept
+    {
+      T volatile * const shared_value = (T*) m_data.team_reduce();
+
+      // Don't overwrite shared memory until all threads arrive
+
+      if ( m_data.team_rendezvous(source_team_rank) ) {
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+
+        f( value );
+
+        if ( 1 < m_data.m_team_size ) { *shared_value = value ; }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+      else {
+        value = *shared_value ;
+      }
+    }
+
+  //--------------------------------------------------------------------------
+  // team_reduce( Sum(result) );
+  // team_reduce( Min(result) );
+  // team_reduce( Max(result) );
+
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  team_reduce( ReducerType const & reducer ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+
+        using value_type = typename ReducerType::value_type ;
+
+        if ( 0 != m_data.m_team_rank ) {
+          // Non-root copies to their local buffer:
+          reducer.copy( (value_type*) m_data.team_reduce_local()
+                      , reducer.data() );
+        }
+
+        // Root does not overwrite shared memory until all threads arrive
+        // and copy to their local buffer.
+
+        if ( m_data.team_rendezvous() ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+          //
+          // This thread sums contributed values
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            value_type * const src =
+              (value_type*) m_data.team_member(i)->team_reduce_local();
+
+            reducer.join( reducer.data() , src );
+          }
+
+          // Copy result to root member's buffer:
+          reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
+
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          // Copy from root member's buffer:
+          reducer.copy( reducer.data() , (value_type*) m_data.team_reduce() );
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< typename ValueType , class JoinOp >
+  KOKKOS_INLINE_FUNCTION
+  ValueType
+  team_reduce( ValueType const & value
+             , JoinOp    const & join ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        *((ValueType*) m_data.team_reduce_local()) = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        const Impl::Reducer< ValueType , JoinOp > reducer( join );
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread sums contributed values
+
+        ValueType * const dst = (ValueType*) m_data.team_reduce_local();
+
+        *dst = value ;
+
+        for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+          ValueType * const src =
+            (ValueType*) m_data.team_member(i)->team_reduce_local();
+
+          reducer.join( dst , src );
+        }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+
+      return *((ValueType*) m_data.team_reduce());
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
+#endif
+
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION
+  T team_scan( T const & value , T * const global = 0 ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        ((T*) m_data.team_reduce_local())[1] = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread scans contributed values
+
+        {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          prev[0] = 0 ;
+          prev[1] = value ;
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * const ptr = (T*) m_data.team_member(i)->team_reduce_local();
+
+            ptr[0] = prev[0] + prev[1] ;
+
+            prev = ptr ;
+          }
+        }
+
+        // If adding to global value then atomic_fetch_add to that value
+        // and sum previous value to every entry of the scan.
+        if ( global ) {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          {
+            T * ptr  = (T*) m_data.team_member( m_data.m_team_size - 1 )->team_reduce_local();
+            prev[0] = Kokkos::atomic_fetch_add( global , ptr[0] + ptr[1] );
+          }
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * ptr = (T*) m_data.team_member(i)->team_reduce_local();
+            ptr[0] += prev[0] ;
+          }
+        }
+
+        m_data.team_rendezvous_release();
+      }
+
+      return ((T*) m_data.team_reduce_local())[0];
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_scan\n"); return T(); }
+#endif
+
+};
+
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<class Space,typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType const & count )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      <iType,Impl::HostThreadTeamMember<Space> >(member,0,count);
+}
+
+template<class Space, typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct
+  < typename std::common_type< iType1, iType2 >::type
+  , Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType1 const & begin , iType2 const & end )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      < typename std::common_type< iType1, iType2 >::type
+      , Impl::HostThreadTeamMember<Space> >( member , begin , end );
+}
+
+template<class Space, typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+ThreadVectorRange
+  ( Impl::HostThreadTeamMember<Space> const & member
+  , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >(member,count);
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Inter-thread parallel_for.
+ *
+ * Executes lambda(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+ */
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
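+
+// For example (illustrative only; 'member', 'N', and 'a' are placeholder names):
+//
+//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N )
+//                       , [&]( const int i ) { a(i) = 2 * a(i); } );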
+
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure, class Reducer >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< Kokkos::is_reducer< Reducer >::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Reducer  const & reducer
+  )
+{
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}
+
+template< typename iType, class Space, typename Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , ValueType      & result
+  )
+{
+  Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > > reducer( & result );
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}
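+
+// For example, a team-level sum (illustrative only; 'member', 'N', and 'a'
+// are placeholder names):
+//
+//   double team_sum = 0 ;
+//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , N )
+//                          , [&]( const int i , double & update ) { update += a(i); }
+//                          , team_sum );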
+
+template< typename iType, class Space
+         , class Closure, class Joiner , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Joiner   const & joiner
+  , ValueType      & result
+  )
+{
+  Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes of the
+ *  calling thread and a summation of val is
+ *  performed and put into result.
+ */
+template< typename iType, class Space , class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& result)
+{
+  result = ValueType();
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment) {
+    lambda(i,result);
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is mapped to all vector lanes of the
+ *  calling thread and a reduction of val is performed using
+ *  JoinType(ValueType& val, const ValueType& update)
+ *  and put into result.
+ *  The input value of result is used as the initializer for
+ *  temporary variables of ValueType. Therefore the input
+ *  value should be the neutral element with respect to the
+ *  join operation (e.g. '0 for +' or '1 for *').
+ */
+template< typename iType, class Space
+        , class Lambda, class JoinType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& result)
+{
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment ) {
+    lambda(i,result);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  // Extract ValueType from the closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+
+  // Intra-member scan
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,false);
+  }
+
+  // 'accum' output is the exclusive prefix sum
+  accum = loop_boundaries.thread.team_scan(accum);
+
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,true);
+  }
+}
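+
+// The scan closure follows the usual Kokkos convention closure(i, accum, final):
+// it is invoked once with final == false to accumulate and once with
+// final == true after 'accum' holds the exclusive prefix sum.  For example
+// (illustrative only; 'member', 'count', and 'offset' are placeholder names):
+//
+//   Kokkos::parallel_scan( Kokkos::TeamThreadRange( member , N )
+//                        , [&]( const int i , long & accum , const bool final )
+//                            { if ( final ) offset(i) = accum ; accum += count(i); } );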
+
+
+template< typename iType, class Space, class ClosureType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , ClosureType const & closure
+  )
+{
+  using value_type = typename
+    Kokkos::Impl::FunctorAnalysis
+      < Impl::FunctorPatternInterface::SCAN
+      , void
+      , ClosureType >::value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,scan_val,true);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerTeam(const Impl::HostThreadTeamMember<Space> & member )
+{
+  return Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerThread(const Impl::HostThreadTeamMember<Space> & member)
+{
+  return Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
+{
+  if ( single.team_member.team_rank() == 0 ) functor();
+  // 'single' does not perform a barrier.
+  // single.team_member.team_barrier( functor );
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor , ValueType & val )
+{
+  single.team_member.team_broadcast( functor , val , 0 );
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor )
+{
+  functor();
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor , ValueType & val )
+{
+  functor(val);
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 84cf536bb7adf86be20459f36f64f4ced027188e..7489018ac641b70e97b6eba879d4c08aa0776fb9 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -52,6 +52,10 @@ void memory_fence()
 {
 #if defined( __CUDA_ARCH__ )
   __threadfence();
+#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
+  asm volatile (
+	  "mfence" ::: "memory"
+  );
 #elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
       ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
   __sync_synchronize();
@@ -76,8 +80,8 @@ void store_fence()
 {
 #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
   asm volatile (
-	"sfence" ::: "memory"
-  	);
+	  "sfence" ::: "memory"
+  );
 #else
   memory_fence();
 #endif
@@ -93,8 +97,8 @@ void load_fence()
 {
 #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
   asm volatile (
-	"lfence" ::: "memory"
-  	);
+	  "lfence" ::: "memory"
+  );
 #else
   memory_fence();
 #endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp b/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
index da95c943fe96acbeda0a8d44525f9f9fd2d65076..5852efb011f357ace9df66c5d330f9e2a3f39dd1 100644
--- a/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
@@ -129,8 +129,8 @@
 #endif
 
 #ifdef KOKKOS_HAVE_CUDA_RDC
-#ifndef KOKKOS_ENABLE_CUDA_RDC
-#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
+#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
 #endif
 #endif
 
@@ -242,9 +242,9 @@
 #endif
 #endif
 
-#ifdef KOKKOS_HAVE_QTHREAD
-#ifndef KOKKOS_ENABLE_QTHREAD
-#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
+#ifdef KOKKOS_HAVE_QTHREADS
+#ifndef KOKKOS_ENABLE_QTHREADS
+#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
 #endif
 #endif
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
index 99c5df4db31001b42f56337938f5a7ea73941157..0c006a8c008390e330f35d849f9b93facfeb1879 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -43,7 +43,7 @@
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <string.h>
 
 namespace Kokkos {
@@ -84,21 +84,21 @@ namespace Kokkos {
             (*endScanCallee)(kernelID);
         }
     }
-    
+
     void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
         if(NULL != beginReduceCallee) {
             Kokkos::fence();
             (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
         }
     }
-    
+
     void endParallelReduce(const uint64_t kernelID) {
         if(NULL != endReduceCallee) {
             Kokkos::fence();
             (*endReduceCallee)(kernelID);
         }
     }
-    
+
 
     void pushRegion(const std::string& kName) {
       if( NULL != pushRegionCallee ) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 3d6a3892524ee3234a33f14cf7727cac5512e455..139a20d8f9ea99b88d21436726fa9c55fe063622 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -50,7 +50,7 @@
 #include <string>
 #include <cinttypes>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_DeviceInfo.hpp>
 #include <dlfcn.h>
 #include <iostream>
@@ -59,7 +59,7 @@
 
 #define KOKKOSP_INTERFACE_VERSION 20150628
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 namespace Kokkos {
   namespace Profiling {
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3ed5f151439c659305773f1cd997376300ccf3e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
@@ -0,0 +1,317 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_REDUCER_HPP
+#define KOKKOS_IMPL_REDUCER_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+/*  Reducer abstraction:
+ *  1) Provides 'join' operation
+ *  2) Provides 'init' operation
+ *  3) Provides 'copy' operation
+ *  4) Optionally provides result value in a memory space
+ *
+ *  Created from:
+ *  1) Functor::operator()( destination , source )
+ *  2) Functor::{ join , init }
+ */
+//----------------------------------------------------------------------------
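+
+/*  A minimal sketch of a user-provided ReduceOp (illustrative only; 'MinOp'
+ *  is a hypothetical example, not part of this header):
+ *
+ *    struct MinOp {
+ *      void join( double & dest , double const & src ) const
+ *        { if ( src < dest ) dest = src ; }
+ *      void init( double & dest ) const
+ *        { dest = std::numeric_limits<double>::max() ; }
+ *    };
+ *
+ *    double result ;
+ *    Kokkos::Impl::Reducer< double , MinOp > r( MinOp() , & result );
+ */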
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename value_type >
+struct ReduceSum
+{
+  KOKKOS_INLINE_FUNCTION static
+  void copy( value_type & dest
+           , value_type const & src ) noexcept
+    { dest = src ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  void init( value_type & dest ) noexcept
+    { new( &dest ) value_type(); }
+
+  KOKKOS_INLINE_FUNCTION static
+  void join( value_type volatile & dest
+           , value_type const volatile & src ) noexcept
+    { dest += src ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  void join( value_type & dest
+           , value_type const & src ) noexcept
+    { dest += src ; }
+};
+
+template< typename T
+        , class ReduceOp = ReduceSum< T >
+        , typename MemorySpace = void >
+struct Reducer
+  : private ReduceOp
+  , private integral_nonzero_constant
+    < int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
+{
+private:
+
+  // Determine if T is simple array
+
+  enum : int { rank = std::rank<T>::value };
+
+  static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
+
+  using length_t =
+    integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
+
+public:
+
+  using reducer        = Reducer ;
+  using memory_space   = MemorySpace ;
+  using value_type     = typename std::remove_extent<T>::type ;
+  using reference_type =
+    typename std::conditional< ( rank != 0 )
+                             , value_type *
+                             , value_type &
+                             >::type ;
+private:
+
+  //--------------------------------------------------------------------------
+  // Determine what functions 'ReduceOp' provides:
+  //   copy( destination , source )
+  //   init( destination )
+  //
+  //   operator()( destination , source )
+  //   join( destination , source )
+  //
+  // Provide defaults for missing optional operations
+
+  template< class R , typename = void>
+  struct COPY {
+    KOKKOS_INLINE_FUNCTION static
+    void copy( R const &
+             , value_type * dst
+             , value_type const * src ) { *dst = *src ; }
+  };
+
+  template< class R >
+  struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
+                                          , *((value_type const *)0) ) ) >
+  {
+    KOKKOS_INLINE_FUNCTION static
+    void copy( R const & r
+             , value_type * dst
+             , value_type const * src ) { r.copy( *dst , *src ); }
+  };
+
+  template< class R , typename = void >
+  struct INIT {
+    KOKKOS_INLINE_FUNCTION static
+    void init( R const & , value_type * dst ) { new(dst) value_type(); }
+  };
+
+  template< class R >
+  struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
+  {
+    KOKKOS_INLINE_FUNCTION static
+    void init( R const & r , value_type * dst ) { r.init( *dst ); }
+  };
+
+  template< class R , typename V , typename = void > struct JOIN
+    {
+      // If no join function then try operator()
+      KOKKOS_INLINE_FUNCTION static
+      void join( R const & r , V * dst , V const * src )
+        { r.operator()(*dst,*src); }
+    };
+
+  template< class R , typename V >
+  struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
+    {
+      // If has join function use it
+      KOKKOS_INLINE_FUNCTION static
+      void join( R const & r , V * dst , V const * src )
+        { r.join(*dst,*src); }
+    };
+
+  //--------------------------------------------------------------------------
+
+  value_type * const m_result ;
+
+  template< int Rank >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr
+  typename std::enable_if< ( 0 != Rank ) , reference_type >::type
+  ref( value_type * p ) noexcept { return p ; }
+
+  template< int Rank >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr
+  typename std::enable_if< ( 0 == Rank ) , reference_type >::type
+  ref( value_type * p ) noexcept { return *p ; }
+
+public:
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr int length() const noexcept
+     { return length_t::value ; }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type * data() const noexcept
+    { return m_result ; }
+
+  KOKKOS_INLINE_FUNCTION
+  reference_type reference() const noexcept
+    { return Reducer::template ref< rank >( m_result ); }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void copy( value_type * const dest
+           , value_type const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type * dest ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type * const dest
+           , value_type const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile * const dest
+           , value_type volatile const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+  template< typename ArgT >
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer
+    ( ArgT * arg_value
+    , typename std::enable_if
+        < std::is_same<ArgT,value_type>::value &&
+          std::is_default_constructible< ReduceOp >::value
+        , int >::type arg_length = 1
+    ) noexcept
+    : ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
+
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer( ReduceOp const & arg_op
+                   , value_type     * arg_value = 0
+                   , int arg_length = 1 ) noexcept
+    : ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
+
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer( ReduceOp      && arg_op
+                   , value_type     * arg_value = 0
+                   , int arg_length = 1 ) noexcept
+    : ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
+
+  Reducer( Reducer const & ) = default ;
+  Reducer( Reducer && ) = default ;
+  Reducer & operator = ( Reducer const & ) = default ;
+  Reducer & operator = ( Reducer && ) = default ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename ValueType >
+constexpr
+Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
+Sum( ValueType & arg_value )
+{
+  static_assert( std::is_trivial<ValueType>::value
+               , "Kokkos reducer requires trivial value type" );
+  return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
+}
+
+template< typename ValueType >
+constexpr
+Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
+Sum( ValueType * arg_value , int arg_length )
+{
+  static_assert( std::is_trivial<ValueType>::value
+               , "Kokkos reducer requires trivial value type" );
+  return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
+}
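+
+// Usage sketch for the Sum reducer (illustrative only; it exercises only the
+// operations declared above):
+//
+//   double partial = 0 ;
+//   auto r = Kokkos::Sum( partial );      // Impl::Reducer< double , ReduceSum<double> >
+//   r.init( r.data() );                   // value-initializes: partial = 0
+//   double contribution = 3.0 ;
+//   r.join( r.data() , & contribution );  // partial += contribution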
+
+//----------------------------------------------------------------------------
+
+template< typename ValueType , class JoinType >
+Impl::Reducer< ValueType , JoinType >
+reducer( ValueType & value , JoinType const & lambda )
+{
+  return Impl::Reducer< ValueType , JoinType >( lambda , & value );
+}
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
index 76161c10f1a8b4ed493772a59e086362b9e2723c..79496133061145aee8786aecb21aa86117b1dbc4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -53,63 +53,126 @@
 
 namespace Kokkos {
 namespace Impl {
-namespace SerialImpl {
+namespace {
 
-Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
+HostThreadTeamData g_serial_thread_team_data ;
 
-Sentinel::~Sentinel()
-{
-  if ( m_scratch ) { free( m_scratch ); }
-  m_scratch = 0 ;
-  m_reduce_end = 0 ;
-  m_shared_end = 0 ;
 }
 
-Sentinel & Sentinel::singleton()
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
 {
-  static Sentinel s ; return s ;
+  if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
+  if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
+
+  const size_t old_pool_reduce  = g_serial_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = g_serial_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = g_serial_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = g_serial_thread_team_data.scratch_bytes();
+
+  // Allocate if any part of the old allocation is too small:
+
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
+
+  if ( allocate ) {
+
+    Kokkos::HostSpace space ;
+
+    if ( old_alloc_bytes ) {
+      g_serial_thread_team_data.disband_team();
+      g_serial_thread_team_data.disband_pool();
+
+      space.deallocate( g_serial_thread_team_data.scratch_buffer()
+                      , g_serial_thread_team_data.scratch_bytes() );
+    }
+
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
+
+    const size_t alloc_bytes =
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    void * const ptr = space.allocate( alloc_bytes );
+
+    g_serial_thread_team_data.
+      scratch_assign( ((char *)ptr)
+                    , alloc_bytes
+                    , pool_reduce_bytes
+                    , team_reduce_bytes
+                    , team_shared_bytes
+                    , thread_local_bytes );
+
+    HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
+
+    g_serial_thread_team_data.organize_pool( pool , 1 );
+    g_serial_thread_team_data.organize_team(1);
+  }
 }
 
-inline
-unsigned align( unsigned n )
+// Get the thread team data structure for the Serial execution space
+HostThreadTeamData * serial_get_thread_team_data()
 {
-  enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
-  return ( n + MASK ) & ~MASK ;
+  return & g_serial_thread_team_data ;
 }
 
-} // namespace
+} // namespace Impl
+} // namespace Kokkos
 
-SerialTeamMember::SerialTeamMember( int arg_league_rank
-                                  , int arg_league_size
-                                  , int arg_shared_size
-                                  )
-  : m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
-           , arg_shared_size )
-  , m_league_rank( arg_league_rank )
-  , m_league_size( arg_league_size )
-{}
+/*--------------------------------------------------------------------------*/
 
-} // namespace Impl
+namespace Kokkos {
 
-void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
+int Serial::is_initialized()
 {
-  static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
+  return 1 ;
+}
 
-  reduce_size = Impl::SerialImpl::align( reduce_size );
-  shared_size = Impl::SerialImpl::align( shared_size );
+void Serial::initialize( unsigned threads_count
+                       , unsigned use_numa_count
+                       , unsigned use_cores_per_numa
+                       , bool allow_asynchronous_threadpool )
+{
+  (void) threads_count;
+  (void) use_numa_count;
+  (void) use_cores_per_numa;
+  (void) allow_asynchronous_threadpool;
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
 
-  if ( ( s.m_reduce_end < reduce_size ) ||
-       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
+void Serial::finalize()
+{
+  if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
+    Impl::g_serial_thread_team_data.disband_team();
+    Impl::g_serial_thread_team_data.disband_pool();
 
-    if ( s.m_scratch ) { free( s.m_scratch ); }
+    Kokkos::HostSpace space ;
 
-    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
-    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
+    space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
+                    , Impl::g_serial_thread_team_data.scratch_bytes() );
 
-    s.m_scratch = malloc( s.m_shared_end );
+    Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
   }
 
-  return s.m_scratch ;
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 
 } // namespace Kokkos
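// A minimal stand-alone sketch of the grow-only scratch policy used above:
// requests are clamped to a floor, compared against what is already held, and
// the buffer is reallocated only when some request exceeds the current size.
// The Buffer type and grow_only_resize name are hypothetical, for illustration
// only; they are not part of the Kokkos sources.

#include <algorithm>
#include <cstddef>

struct Buffer { std::size_t pool = 0, team = 0, capacity = 0; };

// Returns true when the buffer had to grow (sizes never shrink).
inline bool grow_only_resize( Buffer & buf, std::size_t pool, std::size_t team )
{
  pool = std::max<std::size_t>( pool, 512 );   // clamp to the same floor as above
  team = std::max<std::size_t>( team, 512 );

  if ( pool <= buf.pool && team <= buf.team ) return false ;  // still big enough

  buf.pool     = std::max( buf.pool, pool );   // keep the larger of old and new
  buf.team     = std::max( buf.team, team );
  buf.capacity = buf.pool + buf.team ;         // a real resize would also align,
                                               // deallocate, and reallocate here
  return true ;
}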
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
index 19f3abe71ae7049ce0c2674ee2638c07679aa5b0..d22d604fbc2f02e2f18c6c24d69840e7f33e7e98 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
@@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
   using execution_space = Kokkos::Serial ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member exec ;
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
 
   // Loop until all queues are empty
   while ( 0 < queue->m_ready_count ) {
@@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
 
     for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
       for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-        task = queue_type::pop_task( & queue->m_ready[i][j] );
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
       }
     }
 
     if ( end != task ) {
 
-      // pop_task resulted in lock == task->m_next
+      // pop_ready_task resulted in lock == task->m_next
       // In the executing state
 
       (*task->m_apply)( task , & exec );
@@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
   using execution_space = Kokkos::Serial ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member exec ;
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
 
   // Loop until no runnable task
 
@@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
 
     for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
       for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-        task = queue_type::pop_task( & queue->m_ready[i][j] );
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
       }
     }
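// Both execute loops above share one drain pattern: scan the (priority x type)
// ready buckets until something other than the end tag is popped, run it, and
// repeat until nothing runnable remains. A reduced sketch of that pattern; the
// MiniTask type, drain_ready_queues name, and PopFn callback are hypothetical
// stand-ins for the real queue_type / pop_ready_task machinery.

struct MiniTask { void (*apply)( MiniTask * ); };

template< typename PopFn >
inline void drain_ready_queues( MiniTask * buckets[][2], int num_buckets
                              , MiniTask * const end_tag, PopFn pop )
{
  for (;;) {
    MiniTask * task = end_tag ;

    for ( int i = 0 ; i < num_buckets && end_tag == task ; ++i )
      for ( int j = 0 ; j < 2 && end_tag == task ; ++j )
        task = pop( & buckets[i][j] );          // analogous to pop_ready_task

    if ( end_tag == task ) break ;              // nothing runnable: done

    (*task->apply)( task );                     // execute the popped task
  }
}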
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
index 178305c5d3c97da52535324a14333e1878cea730..ac7f17c0ea9e314137560626e0b0467faf5ff90d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -65,6 +65,7 @@ public:
   using memory_space    = Kokkos::HostSpace ;
   using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
   using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
 
   static
   void iff_single_thread_recursive_execute( queue_type * const );
@@ -72,237 +73,19 @@ public:
   static
   void execute( queue_type * const );
 
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( task_base_type::function_type * ptr )
-    {
-      using TaskType = TaskBase< Kokkos::Serial
-                               , typename FunctorType::value_type
-                               , FunctorType
-                               > ;
-       *ptr = TaskType::apply ;
-    }
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
 };
 
 extern template class TaskQueue< Kokkos::Serial > ;
 
-//----------------------------------------------------------------------------
-
-template<>
-class TaskExec< Kokkos::Serial >
-{
-public:
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
-};
-
-template<typename iType>
-struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
-{
-  typedef iType index_type;
-  const iType start ;
-  const iType end ;
-  enum {increment = 1};
-  //const  TaskExec< Kokkos::Serial > & thread;
-  TaskExec< Kokkos::Serial > & thread;
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct
-    //( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    : start(0)
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct
-    //( const TaskExec< Kokkos::Serial > & arg_thread
-    ( TaskExec< Kokkos::Serial > & arg_thread
-    , const iType& arg_start
-    , const iType & arg_end
-    )
-    : start( arg_start )
-    , end(   arg_end)
-    , thread( arg_thread )
-    {}
-};
-
-//----------------------------------------------------------------------------
-
-template<typename iType>
-struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
-{
-  typedef iType index_type;
-  const iType start ;
-  const iType end ;
-  enum {increment = 1};
-  TaskExec< Kokkos::Serial > & thread;
-
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct
-    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    : start( 0 )
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-};
-
 }} /* namespace Kokkos::Impl */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-
-// OMP version needs non-const TaskExec
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
-TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
-}
-
-// OMP version needs non-const TaskExec
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::TaskExec< Kokkos::Serial > >
-TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
-{
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
-           thread, iType(start), iType(end) );
-}
-
-// OMP version needs non-const TaskExec
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
-ThreadVectorRange
-  ( Impl::TaskExec< Kokkos::Serial > & thread
-  , const iType & count )
-{
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
-}
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i, result);
-
-  initialized_result = result;
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i, result);
-
-  initialized_result = result;
-}
-
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-  initialized_result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    initialized_result+=tmp;
-  }
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  ValueType result = initialized_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  initialized_result = result;
-}
-
-template< typename ValueType, typename iType, class Lambda >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda)
-{
-  ValueType accum = 0 ;
-  ValueType val, local_total;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda)
-{
-}
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
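// The header change above drops the old proc_set_apply, which wrote
// TaskType::apply through an output pointer, in favour of
// get_function_pointer<TaskType>(), which returns it directly. A one-line
// sketch of the new shape; get_apply_sketch is a hypothetical stand-in, not
// the Kokkos entry point itself:

template< typename TaskType >
typename TaskType::function_type get_apply_sketch() { return TaskType::apply ; }

// so a caller obtains the pointer as
//   fptr = get_apply_sketch< MyTaskType >();
// instead of handing in storage to be filled.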
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
deleted file mode 100644
index b2aea14df44ea55b8c86a70c9907792b51525918..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_HPP
-#define KOKKOS_SYNCHRONIC_HPP
-
-#include <impl/Kokkos_Synchronic_Config.hpp>
-
-#include <atomic>
-#include <chrono>
-#include <thread>
-#include <functional>
-#include <algorithm>
-
-namespace Kokkos {
-namespace Impl {
-
-enum notify_hint {
-  notify_all,
-  notify_one,
-  notify_none
-};
-enum expect_hint {
-  expect_urgent,
-  expect_delay
-};
-
-namespace Details {
-
-template <class S, class T>
-bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
-  int i = 0;
-  for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
-    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
-      return true;
-    else
-      __synchronic_relax();
-  for(;i < attempts; ++i)
-    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
-      return true;
-    else
-      __synchronic_yield();
-  return false;
-}
-
-struct __exponential_backoff {
-  __exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
-  }
-  static inline void sleep_for(std::chrono::microseconds const& time) {
-    auto t = time.count();
-    if(__builtin_expect(t > 75,0)) {
-      portable_sleep(time);
-    }
-    else if(__builtin_expect(t > 25,0))
-      __synchronic_yield();
-    else
-      __synchronic_relax();
-  }
-  void sleep_for_step() {
-    sleep_for(step());
-  }
-  std::chrono::microseconds step() {
-    float const f = ranfu();
-    int const t = int(microseconds * f);
-    if(__builtin_expect(f >= 0.95f,0))
-      microseconds = 8;
-    else
-      microseconds = (std::min)(microseconds>>1,maximum);
-    return std::chrono::microseconds(t);
-  }
-private :
-  int maximum, microseconds, x, y, z;
-  int xorshf96() {
-    int t;
-    x ^= x << 16; x ^= x >> 5; x ^= x << 1;
-    t = x; x = y; y = z; z = t ^ x ^ y;
-    return z;
-  }
-  float ranfu() {
-    return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
-  }
-};
-
-template <class T, class Enable = void>
-struct __synchronic_base {
-
-protected:
-  std::atomic<T> atom;
-
-  void notify(notify_hint = notify_all) noexcept {
-  }
-  void notify(notify_hint = notify_all) volatile noexcept {
-  }
-
-public :
-  __synchronic_base() noexcept = default;
-  constexpr __synchronic_base(T v) noexcept : atom(v) { }
-  __synchronic_base(const __synchronic_base&) = delete;
-  ~__synchronic_base() { }
-  __synchronic_base& operator=(const __synchronic_base&) = delete;
-  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
-
-  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    while(atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-    }
-  }
-  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    while(atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-    }
-  }
-
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-};
-
-#ifdef __SYNCHRONIC_COMPATIBLE
-template <class T>
-struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
-
-public:
-  std::atomic<T> atom;
-
-  void notify(notify_hint hint = notify_all) noexcept {
-    if(__builtin_expect(hint == notify_none,1))
-      return;
-    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
-    if(__builtin_expect(x,0)) {
-      if(__builtin_expect(hint == notify_all,1))
-        __synchronic_wake_all(&atom);
-      else
-        __synchronic_wake_one(&atom);
-    }
-  }
-  void notify(notify_hint hint = notify_all) volatile noexcept {
-    if(__builtin_expect(hint == notify_none,1))
-      return;
-    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
-    if(__builtin_expect(x,0)) {
-      if(__builtin_expect(hint == notify_all,1))
-        __synchronic_wake_all_volatile(&atom);
-      else
-        __synchronic_wake_one_volatile(&atom);
-    }
-  }
-
-public :
-  __synchronic_base() noexcept : count(0) { }
-  constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
-  __synchronic_base(const __synchronic_base&) = delete;
-  ~__synchronic_base() { }
-  __synchronic_base& operator=(const __synchronic_base&) = delete;
-  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
-
-  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait(&atom,val);
-      count.fetch_add(-1,std::memory_order_acquire);
-    }
-  }
-  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_volatile(&atom,val);
-      count.fetch_add(-1,std::memory_order_acquire);
-    }
-  }
-
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_timed(&atom,val,remains);
-      count.fetch_add(-1,std::memory_order_acquire);
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_timed_volatile(&atom,val,remains);
-      count.fetch_add(-1,std::memory_order_acquire);
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-private:
-  mutable std::atomic<int> count;
-};
-#endif
-
-template <class T, class Enable = void>
-struct __synchronic : public __synchronic_base<T> {
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-};
-
-template <class T>
-struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
-
-  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_and(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_and(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_or(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_or(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_xor(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_xor(v,m);
-    this->notify(n);
-    return t;
-  }
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-
-  T operator=(T v) volatile noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T operator=(T v) noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T operator++(int) volatile noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T operator++(int) noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T operator--(int) volatile noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T operator--(int) noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T operator++() volatile noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T operator++() noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T operator--() volatile noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T operator--() noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T operator+=(T v) volatile noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T operator+=(T v) noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T operator-=(T v) volatile noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T operator-=(T v) noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T operator&=(T v) volatile noexcept {
-    auto const t = this->atom &= v;
-    this->notify();
-    return t;
-  }
-  T operator&=(T v) noexcept {
-    auto const t = this->atom &= v;
-    this->notify();
-    return t;
-  }
-  T operator|=(T v) volatile noexcept {
-    auto const t = this->atom |= v;
-    this->notify();
-    return t;
-  }
-  T operator|=(T v) noexcept {
-    auto const t = this->atom |= v;
-    this->notify();
-    return t;
-  }
-  T operator^=(T v) volatile noexcept {
-    auto const t = this->atom ^= v;
-    this->notify();
-    return t;
-  }
-  T operator^=(T v) noexcept {
-    auto const t = this->atom ^= v;
-    this->notify();
-    return t;
-  }
-};
-
-template <class T>
-struct __synchronic<T*> : public __synchronic_base<T*> {
-
-  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-
-  T* operator=(T* v) volatile noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T* operator=(T* v) noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T* operator++(int) volatile noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator++(int) noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator--(int) volatile noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator--(int) noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator++() volatile noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T* operator++() noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T* operator--() volatile noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T* operator--() noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T* operator+=(ptrdiff_t v) volatile noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T* operator+=(ptrdiff_t v) noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T* operator-=(ptrdiff_t v) volatile noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T* operator-=(ptrdiff_t v) noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-};
-
-} //namespace Details
-
-template <class T>
-struct synchronic : public Details::__synchronic<T> {
-
-  bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
-  bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
-  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    this->atom.store(v,m);
-    this->notify(n);
-  }
-  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    this->atom.store(v,m);
-    this->notify(n);
-  }
-  T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
-  T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
-
-  operator T() const volatile noexcept { return (T)this->atom; }
-  operator T() const noexcept { return (T)this->atom; }
-
-  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.exchange(v,m);
-    this->notify(n);
-    return t;
-  }
-  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.exchange(v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m);
-    this->notify(n);
-    return t;
-  }
-
-  synchronic() noexcept = default;
-  constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
-  synchronic(const synchronic&) = delete;
-  ~synchronic() { }
-  synchronic& operator=(const synchronic&) = delete;
-  synchronic& operator=(const synchronic&) volatile = delete;
-  T operator=(T val) noexcept {
-    return Details::__synchronic<T>::operator=(val);
-  }
-  T operator=(T val) volatile noexcept {
-    return Details::__synchronic<T>::operator=(val);
-  }
-
-  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
-    Details::__synchronic<T>::expect_update(val,h);
-    return load(order);
-  }
-  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
-    Details::__synchronic<T>::expect_update(val,h);
-    return load(order);
-  }
-  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
-    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
-      Details::__synchronic<T>::expect_update(nval,h);
-    return load(order);
-  }
-  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
-    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
-      expect_update(nval,h);
-    return load(order);
-  }
-  template <class Rep, class Period>
-  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
-    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
-  }
-  template < class Rep, class Period>
-  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
-    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
-  }
-};
-
-#include <inttypes.h>
-
-typedef synchronic<char> synchronic_char;
-typedef synchronic<char> synchronic_schar;
-typedef synchronic<unsigned char> synchronic_uchar;
-typedef synchronic<short> synchronic_short;
-typedef synchronic<unsigned short> synchronic_ushort;
-typedef synchronic<int> synchronic_int;
-typedef synchronic<unsigned int> synchronic_uint;
-typedef synchronic<long> synchronic_long;
-typedef synchronic<unsigned long> synchronic_ulong;
-typedef synchronic<long long> synchronic_llong;
-typedef synchronic<unsigned long long> synchronic_ullong;
-//typedef synchronic<char16_t> synchronic_char16_t;
-//typedef synchronic<char32_t> synchronic_char32_t;
-typedef synchronic<wchar_t> synchronic_wchar_t;
-
-typedef synchronic<int_least8_t> synchronic_int_least8_t;
-typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
-typedef synchronic<int_least16_t> synchronic_int_least16_t;
-typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
-typedef synchronic<int_least32_t> synchronic_int_least32_t;
-typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
-//typedef synchronic<int_least_64_t> synchronic_int_least_64_t;
-typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
-typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
-typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
-typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
-typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
-typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
-typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
-typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
-typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
-typedef synchronic<intptr_t> synchronic_intptr_t;
-typedef synchronic<uintptr_t> synchronic_uintptr_t;
-typedef synchronic<size_t> synchronic_size_t;
-typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
-typedef synchronic<intmax_t> synchronic_intmax_t;
-typedef synchronic<uintmax_t> synchronic_uintmax_t;
-
-}
-}
-
-#endif //__SYNCHRONIC_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
deleted file mode 100644
index 0a6dd6e715edad752f56756ccdc6fba3d43e30fb..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
-#define KOKKOS_SYNCHRONIC_CONFIG_H
-
-#include <thread>
-#include <chrono>
-
-namespace Kokkos {
-namespace Impl {
-
-//the default yield function used inside the implementation is the Standard one
-#define __synchronic_yield std::this_thread::yield
-#define __synchronic_relax __synchronic_yield
-
-#if defined(_MSC_VER)
-    //this is a handy GCC optimization that I use inside the implementation
-    #define __builtin_expect(condition,common) condition
-    #if _MSC_VER <= 1800
-        //using certain keywords that VC++ temporarily doesn't support
-        #define _ALLOW_KEYWORD_MACROS
-        #define noexcept
-        #define constexpr
-    #endif
-    //yes, I define multiple assignment operators
-    #pragma warning(disable:4522)
-    //I don't understand how Windows is so bad at timing functions, but is OK
-    //with straight-up yield loops
-    #define __do_backoff(b) __synchronic_yield()
-#else
-#define __do_backoff(b) b.sleep_for_step()
-#endif
-
-//certain platforms have efficient support for spin-waiting built into the operating system
-#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
-#if defined(_WIN32_WINNT)
-#include <winsock2.h>
-#include <Windows.h>
-    //the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
-    #define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
-    #define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
-    #define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
-    #define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
-    #define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
-    #define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
-    #define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
-    #define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
-    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
-
-    inline void native_sleep(unsigned long microseconds)
-    {
-      // What to do if microseconds is < 1000?
-      Sleep(microseconds / 1000);
-    }
-
-    inline void native_yield()
-    {
-      SwitchToThread();
-    }
-#elif defined(__linux__)
-    #include <chrono>
-    #include <time.h>
-    #include <unistd.h>
-    #include <pthread.h>
-    #include <linux/futex.h>
-    #include <sys/syscall.h>
-    #include <climits>
-    #include <cassert>
-    template < class Rep, class Period>
-    inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
-      struct timespec ts;
-      ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
-      assert(!ts.tv_sec);
-      ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
-      return ts;
-    }
-    inline long futex(void const* addr1, int op, int val1) {
-        return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
-    }
-    inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
-        return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
-    }
-    inline void native_sleep(unsigned long microseconds)
-    {
-      usleep(microseconds);
-    }
-    inline void native_yield()
-    {
-      pthread_yield();
-    }
-
-    //the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
-    #define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
-    #define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
-    #define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
-    #define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
-    #define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
-    #define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
-    #define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
-    #define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
-    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
-
-    //the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
-    #undef __synchronic_yield
-    #define __synchronic_yield sched_yield
-
-    //for extremely short wait times, just let another hyper-thread run
-    #undef __synchronic_relax
-    #define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
-
-#endif
-#endif
-
-#ifdef _GLIBCXX_USE_NANOSLEEP
-inline void portable_sleep(std::chrono::microseconds const& time)
-{ std::this_thread::sleep_for(time); }
-#else
-inline void portable_sleep(std::chrono::microseconds const& time)
-{ native_sleep(time.count()); }
-#endif
-
-#ifdef _GLIBCXX_USE_SCHED_YIELD
-inline void portable_yield()
-{ std::this_thread::yield(); }
-#else
-inline void portable_yield()
-{ native_yield(); }
-#endif
-
-//this is the number of times we initially spin, on the first wait attempt
-#define __SYNCHRONIC_SPIN_COUNT_A 16
-
-//this is how decide to yield instead of just spinning, 'c' is the current trip count
-//#define __SYNCHRONIC_SPIN_YIELD(c) true
-#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
-
-//this is the number of times we normally spin, on every subsequent wait attempt
-#define __SYNCHRONIC_SPIN_COUNT_B 8
-
-}
-}
-
-#endif //__SYNCHRONIC_CONFIG_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
deleted file mode 100644
index facc8d6d8e67a4828aa94bd75fb7590f454b41f6..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
-#define KOKKOS_SYNCHRONIC_N3998_HPP
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <functional>
-
-/*
-In the section below, a synchronization point represents a point at which a
-thread may block until a given synchronization condition has been reached or
-at which it may notify other threads that a synchronization condition has
-been achieved.
-*/
-namespace Kokkos { namespace Impl {
-
-    /*
-    A latch maintains an internal counter that is initialized when the latch
-    is created. The synchronization condition is reached when the counter is
-    decremented to 0. Threads may block at a synchronization point waiting
-    for the condition to be reached. When the condition is reached, any such
-    blocked threads will be released.
-    */
-    struct latch {
-        latch(int val) : count(val), released(false) { }
-        latch(const latch&) = delete;
-        latch& operator=(const latch&) = delete;
-        ~latch( ) { }
-        void arrive( ) {
-            __arrive( );
-        }
-        void arrive_and_wait( ) {
-            if(!__arrive( ))
-                wait( );
-        }
-        void wait( ) {
-            while(!released.load_when_not_equal(false,std::memory_order_acquire))
-                ;
-        }
-        bool try_wait( ) {
-            return released.load(std::memory_order_acquire);
-        }
-    private:
-        bool __arrive( ) {
-            if(count.fetch_add(-1,std::memory_order_release)!=1)
-                return false;
-            released.store(true,std::memory_order_release);
-            return true;
-        }
-        std::atomic<int> count;
-        synchronic<bool> released;
-    };
-
-    /*
-    A barrier is created with an initial value representing the number of threads
-    that can arrive at the synchronization point. When that many threads have
-    arrived, the  synchronization condition is reached and the threads are
-    released. The barrier will then reset, and may be reused for a new cycle, in
-    which the same set of threads may arrive again at the synchronization point.
-    The same set of threads shall arrive at the barrier in each cycle, otherwise
-    the behaviour is undefined.
-    */
-    struct barrier {
-        barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
-        barrier(const barrier&) = delete;
-        barrier& operator=(const barrier&) = delete;
-        ~barrier() { }
-        void arrive_and_wait() {
-            int const myepoch = epoch.load(std::memory_order_relaxed);
-            if(!__arrive(myepoch))
-                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
-                    ;
-        }
-        void arrive_and_drop() {
-            nexpected.fetch_add(-1,std::memory_order_relaxed);
-            __arrive(epoch.load(std::memory_order_relaxed));
-        }
-    private:
-        bool __arrive(int const myepoch) {
-            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
-            if(__builtin_expect(myresult == expected,0)) {
-                expected = nexpected.load(std::memory_order_relaxed);
-                arrived.store(0,std::memory_order_relaxed);
-                epoch.store(myepoch+1,std::memory_order_release);
-                return true;
-            }
-            return false;
-        }
-        int expected;
-        std::atomic<int> arrived, nexpected;
-        synchronic<int> epoch;
-    };
-
-    /*
-    A notifying barrier behaves as a barrier, but is constructed with a callable
-    completion function that is invoked after all threads have arrived at the
-    synchronization point, and before the synchronization condition is reached.
-    The completion may modify the set of threads that arrives at the barrier in
-    each cycle.
-    */
-    struct notifying_barrier {
-        template <typename T>
-        notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
-        notifying_barrier(const notifying_barrier&) = delete;
-        notifying_barrier& operator=(const notifying_barrier&) = delete;
-        ~notifying_barrier( ) { }
-        void arrive_and_wait() {
-            int const myepoch = epoch.load(std::memory_order_relaxed);
-            if(!__arrive(myepoch))
-                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
-                    ;
-        }
-        void arrive_and_drop() {
-            nexpected.fetch_add(-1,std::memory_order_relaxed);
-            __arrive(epoch.load(std::memory_order_relaxed));
-        }
-    private:
-        bool __arrive(int const myepoch) {
-            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
-            if(__builtin_expect(myresult == expected,0)) {
-                int const newexpected = completion();
-                expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
-                arrived.store(0,std::memory_order_relaxed);
-                epoch.store(myepoch+1,std::memory_order_release);
-                return true;
-            }
-            return false;
-        }
-        int expected;
-        std::atomic<int> arrived, nexpected;
-        synchronic<int> epoch;
-        std::function<int()> completion;
-    };
-}}
-
-#endif //__N3998_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index afa01d0cde1f1253f216c415b81bf5c8fee1de2b..b514df351725ac55e88ea1c2e92eec4b1711e6b4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -76,9 +76,6 @@ namespace Impl {
 template< typename Space , typename ResultType , typename FunctorType >
 class TaskBase ;
 
-template< typename Space >
-class TaskExec ;
-
 } /* namespace Impl */
 } /* namespace Kokkos */
 
@@ -149,8 +146,8 @@ private:
   //     task->m_next is the dependence or zero
   //   Postcondition:
   //     task->m_next is linked list membership
-  KOKKOS_FUNCTION
-  void schedule( task_root_type * const );
+  KOKKOS_FUNCTION void schedule_runnable(  task_root_type * const );
+  KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
 
   // Reschedule a task
   //   Precondition:
@@ -178,7 +175,7 @@ private:
                        , task_root_type * const );
 
   KOKKOS_FUNCTION
-  static task_root_type * pop_task( task_root_type * volatile * const );
+  static task_root_type * pop_ready_task( task_root_type * volatile * const );
 
   KOKKOS_FUNCTION static
   void decrement( task_root_type * task );
@@ -368,6 +365,7 @@ public:
   int16_t        m_task_type ;   ///< Type of task
   int16_t        m_priority ;    ///< Priority of runnable task
 
+  TaskBase() = delete ;
   TaskBase( TaskBase && ) = delete ;
   TaskBase( const TaskBase & ) = delete ;
   TaskBase & operator = ( TaskBase && ) = delete ;
@@ -375,17 +373,43 @@ public:
 
   KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
 
+  // Constructor for a runnable task
   KOKKOS_INLINE_FUNCTION
-  constexpr TaskBase() noexcept
-    : m_apply(0)
-    , m_queue(0)
-    , m_wait(0)
-    , m_next(0)
-    , m_ref_count(0)
-    , m_alloc_size(0)
-    , m_dep_count(0)
-    , m_task_type( TaskSingle )
-    , m_priority( 1 /* TaskRegularPriority */ )
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , TaskBase    * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    ) noexcept
+    : m_apply(      arg_apply )
+    , m_queue(      arg_queue )
+    , m_wait( 0 )
+    , m_next(       arg_dependence )
+    , m_ref_count(  arg_ref_count )
+    , m_alloc_size( arg_alloc_size )
+    , m_dep_count( 0 )
+    , m_task_type(  arg_task_type )
+    , m_priority(   arg_priority )
+    {}
+
+  // Constructor for an aggregate task
+  KOKKOS_INLINE_FUNCTION
+  constexpr TaskBase( queue_type  * arg_queue
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_dep_count
+                    ) noexcept
+    : m_apply( 0 )
+    , m_queue( arg_queue )
+    , m_wait( 0 )
+    , m_next( 0 )
+    , m_ref_count(  arg_ref_count )
+    , m_alloc_size( arg_alloc_size )
+    , m_dep_count(  arg_dep_count )
+    , m_task_type(  Aggregate )
+    , m_priority( 0 )
     {}
 
   //----------------------------------------
@@ -406,9 +430,13 @@ public:
   KOKKOS_INLINE_FUNCTION
   void add_dependence( TaskBase* dep )
     {
+      // Precondition: lock == m_next
+
+      TaskBase * const lock = (TaskBase *) LockTag ;
+
       // Assign dependence to m_next.  It will be processed in the subsequent
       // call to schedule.  Error if the dependence is reset.
-      if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
+      if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
         Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
       }
 
@@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
 {
 private:
 
-  static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
+  using root_type     = TaskBase<ExecSpace,void,void> ;
+  using function_type = typename root_type::function_type ;
+  using queue_type    = typename root_type::queue_type ;
 
+  static_assert( sizeof(root_type) == 48 , "" );
+
+  TaskBase() = delete ;
   TaskBase( TaskBase && ) = delete ;
   TaskBase( const TaskBase & ) = delete ;
   TaskBase & operator = ( TaskBase && ) = delete ;
@@ -444,9 +477,24 @@ public:
 
   KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
 
+  // Constructor for runnable task
   KOKKOS_INLINE_FUNCTION
-  TaskBase()
-    : TaskBase< ExecSpace , void , void >()
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , root_type   * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    )
+    : root_type( arg_apply
+               , arg_queue
+               , arg_dependence
+               , arg_ref_count
+               , arg_alloc_size
+               , arg_task_type
+               , arg_priority
+               )
     , m_result()
     {}
 
@@ -471,11 +519,14 @@ private:
 
 public:
 
-  using root_type    = TaskBase< ExecSpace , void , void > ;
-  using base_type    = TaskBase< ExecSpace , ResultType , void > ;
-  using member_type  = TaskExec< ExecSpace > ;
-  using functor_type = FunctorType ;
-  using result_type  = ResultType ;
+  using root_type       = TaskBase< ExecSpace , void , void > ;
+  using base_type       = TaskBase< ExecSpace , ResultType , void > ;
+  using specialization  = TaskQueueSpecialization< ExecSpace > ;
+  using function_type   = typename root_type::function_type ;
+  using queue_type      = typename root_type::queue_type ;
+  using member_type     = typename specialization::member_type ;
+  using functor_type    = FunctorType ;
+  using result_type     = ResultType ;
 
   template< typename Type >
   KOKKOS_INLINE_FUNCTION static
@@ -522,13 +573,30 @@ public:
       if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
         // Did not respawn, destroy the functor to free memory.
         static_cast<functor_type*>(task)->~functor_type();
-        // Cannot destroy the task until its dependences have been processed.
+        // Cannot destroy and deallocate the task until its dependences
+        // have been processed.
       }
     }
 
+  // Constructor for runnable task
   KOKKOS_INLINE_FUNCTION
-  TaskBase( functor_type const & arg_functor )
-    : base_type()
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , root_type   * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    , FunctorType && arg_functor
+                    )
+    : base_type( arg_apply
+               , arg_queue
+               , arg_dependence
+               , arg_ref_count
+               , arg_alloc_size
+               , arg_task_type
+               , arg_priority
+               )
     , functor_type( arg_functor )
     {}
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index fefbbad8bde297ce94ad99058e6f25eca6046b7e..23f5d3cd30dbbf87c024af935356961c1642a022 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
   )
 {
   // Push task into a concurrently pushed and popped queue.
+  // The queue can be either a ready task queue or a waiting task queue.
   // The queue is a linked list where 'task->m_next' form the links.
   // Fail the push attempt if the queue is locked;
   // otherwise retry until the push succeeds.
@@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
 template< typename ExecSpace >
 KOKKOS_FUNCTION
 typename TaskQueue< ExecSpace >::task_root_type *
-TaskQueue< ExecSpace >::pop_task
+TaskQueue< ExecSpace >::pop_ready_task
   ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
 {
-  // Pop task from a concurrently pushed and popped queue.
+  // Pop task from a concurrently pushed and popped ready task queue.
   // The queue is a linked list where 'task->m_next' form the links.
 
-  task_root_type * const zero = (task_root_type *) 0 ;
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
@@ -252,85 +252,83 @@ TaskQueue< ExecSpace >::pop_task
     // (1) lock, (2) end, or (3) a valid task.
     // Thus zero will never appear in the queue.
     //
-    // If queue is locked then just read by guaranteeing
-    // the CAS will fail.
+    // If queue is locked then just read by guaranteeing the CAS will fail.
 
     if ( lock == task ) task = 0 ;
 
     task_root_type * const x = task ;
 
-    task = Kokkos::atomic_compare_exchange(queue,task,lock);
-
-    if ( x == task ) break ; // CAS succeeded and queue is locked
-  }
+    task = Kokkos::atomic_compare_exchange(queue,x,lock);
 
-  if ( end != task ) {
+    if ( x == task ) {
+      // CAS succeeded and queue is locked
+      //
+      // This thread has locked the queue and removed 'task' from the queue.
+      // Extract the next entry of the queue from 'task->m_next'
+      // and mark 'task' as popped from a queue by setting
+      // 'task->m_next = lock'.
+      //
+      // Place the next entry in the head of the queue,
+      // which also unlocks the queue.
+      //
+      // This thread has exclusive access to
+      // the queue and the popped task's m_next.
 
-    // This thread has locked the queue and removed 'task' from the queue.
-    // Extract the next entry of the queue from 'task->m_next'
-    // and mark 'task' as popped from a queue by setting
-    // 'task->m_next = lock'.
+      *queue = task->m_next ; task->m_next = lock ;
 
-    task_root_type * const next =
-      Kokkos::atomic_exchange( & task->m_next , lock );
+      Kokkos::memory_fence();
 
-    // Place the next entry in the head of the queue,
-    // which also unlocks the queue.
-
-    task_root_type * const unlock =
-      Kokkos::atomic_exchange( queue , next );
+#if 0
+      printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+            , uintptr_t(queue)
+            , uintptr_t(task)
+            , uintptr_t(task->m_wait)
+            , uintptr_t(task->m_next)
+            , int(task->m_task_type)
+            , int(task->m_priority)
+            , int(task->m_ref_count) );
+#endif
 
-    if ( next == zero || next == lock || lock != unlock ) {
-      Kokkos::abort("TaskQueue::pop_task ERROR");
+      return task ;
     }
   }
 
-#if 0
-  if ( end != task ) {
-    printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
-          , uintptr_t(queue)
-          , uintptr_t(task)
-          , uintptr_t(task->m_wait)
-          , uintptr_t(task->m_next)
-          , int(task->m_task_type)
-          , int(task->m_priority)
-          , int(task->m_ref_count) );
-  }
-#endif
-
-  return task ;
+  return end ;
 }
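+
+// In essence the pop above is a CAS-based "lock the head, detach, republish"
+// on a singly linked list.  A minimal sketch of the same idea with std::atomic
+// (illustrative only; the real code uses Kokkos atomics and the Lock/End tags):
+//
+//   Node * pop( std::atomic< Node * > & head , Node * const LOCK , Node * const END )
+//   {
+//     for (;;) {
+//       Node * h = head.load();
+//       if ( h == END  ) return END ;                    // queue is empty
+//       if ( h == LOCK ) continue ;                      // another thread holds the lock
+//       if ( head.compare_exchange_weak( h , LOCK ) ) {  // lock the queue, detach 'h'
+//         head.store( h->next );                         // publish new head == unlock
+//         h->next = LOCK ;                               // mark 'h' as popped
+//         return h ;
+//       }
+//     }
+//   }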
 
 //----------------------------------------------------------------------------
 
 template< typename ExecSpace >
 KOKKOS_FUNCTION
-void TaskQueue< ExecSpace >::schedule
+void TaskQueue< ExecSpace >::schedule_runnable
   ( TaskQueue< ExecSpace >::task_root_type * const task )
 {
-  // Schedule a runnable or when_all task upon construction / spawn
+  // Schedule a runnable task upon construction / spawn
   // and upon completion of other tasks that 'task' is waiting on.
-
-  // Precondition on runnable task state:
-  //   task is either constructing or executing
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  // - if runnable then task is either constructing or respawning
   //
   //   Constructing state:
   //     task->m_wait == 0
-  //     task->m_next == dependence
-  //   Executing-respawn state:
-  //     task->m_wait == head of linked list
-  //     task->m_next == dependence
+  //     task->m_next == dependence or 0
+  //   Respawn state:
+  //     task->m_wait == head of linked list: 'end' or valid task
+  //     task->m_next == dependence or 0
   //
   //  Task state transition:
-  //     Constructing      ->  Waiting
-  //     Executing-respawn ->  Waiting
+  //     Constructing ->  Waiting
+  //     Respawn      ->  Waiting
   //
   //  Postcondition on task state:
-  //     task->m_wait == head of linked list
-  //     task->m_next == member of linked list
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
 
 #if 0
-  printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+  printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
         , uintptr_t(task)
         , uintptr_t(task->m_wait)
         , uintptr_t(task->m_next)
@@ -343,135 +341,204 @@ void TaskQueue< ExecSpace >::schedule
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-  //----------------------------------------
-  {
-    // If Constructing then task->m_wait == 0
-    // Change to waiting by task->m_wait = EndTag
-
-    task_root_type * const init =
-      Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
+  bool respawn = false ;
 
-    // Precondition
+  //----------------------------------------
 
-    if ( lock == init ) {
-      Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
-    }
+  if ( zero == task->m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
 
-    // if ( init == 0 ) Constructing       ->  Waiting
-    // else             Executing-Respawn  ->  Waiting
+    task->m_wait = end ;
+    // Task in Waiting state
   }
+  else if ( lock != task->m_wait ) {
+    // Task in Executing state with Respawn request
+    // - Update dependence
+    // - Transition to Waiting state
+    respawn = true ;
+  }
+  else {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
+  }
+
   //----------------------------------------
+  // Scheduling a runnable task which may have a dependence 'dep'.
+  // Extract dependence, if any, from task->m_next.
+  // If 'dep' is not null then attempt to push 'task'
+  // into the wait queue of 'dep'.
+  // If the push succeeds then 'task' may be
+  // processed or executed by another thread at any time.
+  // If the push fails then 'dep' is complete and 'task'
+  // is ready to execute.
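+  // For example, if 'dep' has already completed then dep->m_wait == LockTag,
+  // the push fails immediately, and 'task' goes straight to the ready queue.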
+
+  // Exclusive access so don't need an atomic exchange
+  // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+  task_root_type * dep = task->m_next ; task->m_next = zero ;
+
+  const bool is_ready =
+    ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+
+  if ( ( 0 != dep ) && respawn ) {
+    // Reference count for dep was incremented when
+    // respawn assigned the dependence to task->m_next
+    // so that, if dep completed prior to the above
+    // push_task, dep would not be destroyed.
+    // dep reference count can now be decremented,
+    // which may deallocate the task.
+    TaskQueue::assign( & dep , (task_root_type *)0 );
+  }
 
-  if ( task_root_type::Aggregate != task->m_task_type ) {
+  if ( is_ready ) {
 
-    // Scheduling a runnable task which may have a depencency 'dep'.
-    // Extract dependence, if any, from task->m_next.
-    // If 'dep' is not null then attempt to push 'task'
-    // into the wait queue of 'dep'.
-    // If the push succeeds then 'task' may be
-    // processed or executed by another thread at any time.
-    // If the push fails then 'dep' is complete and 'task'
-    // is ready to execute.
+    // No dependence or 'dep' is complete so push task into ready queue.
+    // Increment the ready count before pushing into ready queue
+    // to track number of ready + executing tasks.
+    // The ready count will be decremented when the task is complete.
 
-    task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+    Kokkos::atomic_increment( & m_ready_count );
 
-    const bool is_ready =
-      ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+    task_root_type * volatile * const ready_queue =
+      & m_ready[ task->m_priority ][ task->m_task_type ];
 
-    // Reference count for dep was incremented when assigned
-    // to task->m_next so that if it completed prior to the
-    // above push_task dep would not be destroyed.
-    // dep reference count can now be decremented,
-    // which may deallocate the task.
-    TaskQueue::assign( & dep , (task_root_type *)0 );
+    // A push_task fails if the ready queue is locked.
+    // A ready queue is only locked during a push or pop;
+    // i.e., it is never permanently locked.
+    // Retry push to ready queue until it succeeds.
+    // When the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
 
-    if ( is_ready ) {
+    while ( ! push_task( ready_queue , task ) );
+  }
 
-      // No dependence or 'dep' is complete so push task into ready queue.
-      // Increment the ready count before pushing into ready queue
-      // to track number of ready + executing tasks.
-      // The ready count will be decremented when the task is complete.
+  //----------------------------------------
+  // Postcondition:
+  // - A runnable 'task' was pushed into a wait or ready queue.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
+}
 
-      Kokkos::atomic_increment( & m_ready_count );
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule_aggregate
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule an aggregate task upon construction
+  // and upon completion of other tasks that 'task' is waiting on.
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence or 0
+  //
+  //  Task state transition:
+  //     Constructing ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
+
+#if 0
+  printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
 
-      task_root_type * volatile * const queue =
-        & m_ready[ task->m_priority ][ task->m_task_type ];
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-      // A push_task fails if the ready queue is locked.
-      // A ready queue is only locked during a push or pop;
-      // i.e., it is never permanently locked.
-      // Retry push to ready queue until it succeeds.
-      // When the push succeeds then 'task' may be
-      // processed or executed by another thread at any time.
+  //----------------------------------------
 
-      while ( ! push_task( queue , task ) );
-    }
+  if ( zero == task->m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
+
+    task->m_wait = end ;
+    // Task in Waiting state
+  }
+  else if ( lock == task->m_wait ) {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
   }
+
   //----------------------------------------
-  else {
-    // Scheduling a 'when_all' task with multiple dependences.
-    // This scheduling may be called when the 'when_all' is
-    // (1) created or
-    // (2) being removed from a completed task's wait list.
+  // Scheduling a 'when_all' task with multiple dependences.
+  // This scheduling may be called when the 'when_all' is
+  // (1) created or
+  // (2) being removed from a completed task's wait list.
 
-    task_root_type ** const aggr = task->aggregate_dependences();
+  task_root_type ** const aggr = task->aggregate_dependences();
 
-    // Assume the 'when_all' is complete until a dependence is
-    // found that is not complete.
+  // Assume the 'when_all' is complete until a dependence is
+  // found that is not complete.
 
-    bool is_complete = true ;
+  bool is_complete = true ;
 
-    for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+  for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
 
-      --i ;
+    --i ;
 
-      // Loop dependences looking for an incomplete task.
-      // Add this task to the incomplete task's wait queue.
+    // Loop over the dependences looking for an incomplete task.
+    // Add this task to the incomplete task's wait queue.
 
-      // Remove a task 'x' from the dependence list.
-      // The reference count of 'x' was incremented when
-      // it was assigned into the dependence list.
+    // Remove a task 'x' from the dependence list.
+    // The reference count of 'x' was incremented when
+    // it was assigned into the dependence list.
 
-      task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+    // Exclusive access so don't need an atomic exchange
+    // task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+    task_root_type * x = aggr[i] ; aggr[i] = zero ;
 
-      if ( x ) {
+    if ( x ) {
 
-        // If x->m_wait is not locked then push succeeds
-        // and the aggregate is not complete.
-        // If the push succeeds then this when_all 'task' may be
-        // processed by another thread at any time.
-        // For example, 'x' may be completeed by another
-        // thread and then re-schedule this when_all 'task'.
+      // If x->m_wait is not locked then push succeeds
+      // and the aggregate is not complete.
+      // If the push succeeds then this when_all 'task' may be
+      // processed by another thread at any time.
+      // For example, 'x' may be completed by another
+      // thread and then re-schedule this when_all 'task'.
 
-        is_complete = ! push_task( & x->m_wait , task );
+      is_complete = ! push_task( & x->m_wait , task );
 
-        // Decrement reference count which had been incremented
-        // when 'x' was added to the dependence list.
+      // Decrement reference count which had been incremented
+      // when 'x' was added to the dependence list.
 
-        TaskQueue::assign( & x , zero );
-      }
+      TaskQueue::assign( & x , zero );
     }
+  }
 
-    if ( is_complete ) {
-      // The when_all 'task' was not added to a wait queue because
-      // all dependences were complete so this aggregate is complete.
-      // Complete the when_all 'task' to schedule other tasks
-      // that are waiting for the when_all 'task' to complete.
+  if ( is_complete ) {
+    // The when_all 'task' was not added to a wait queue because
+    // all dependences were complete so this aggregate is complete.
+    // Complete the when_all 'task' to schedule other tasks
+    // that are waiting for the when_all 'task' to complete.
 
-      task->m_next = lock ;
+    task->m_next = lock ;
 
-      complete( task );
+    complete( task );
 
-      // '*task' may have been deleted upon completion
-    }
+    // '*task' may have been deleted upon completion
   }
+
   //----------------------------------------
   // Postcondition:
-  //   A runnable 'task' was pushed into a wait or ready queue.
-  //   An aggregate 'task' was either pushed to a wait queue
-  //   or completed.
-  // Concurrent execution may have already popped 'task'
-  // from a queue and processed it as appropriate.
+  // - An aggregate 'task' was either pushed to a wait queue or completed.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
 }
 
 //----------------------------------------------------------------------------
@@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
     // Is a runnable task has finished executing and requested respawn.
     // Schedule the task for subsequent execution.
 
-    schedule( task );
+    schedule_runnable( task );
   }
   //----------------------------------------
   else {
@@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
       TaskQueue::assign( & task , zero );
 
       // This thread has exclusive access to the wait list so
-      // the concurrency-safe pop_task function is not needed.
+      // the concurrency-safe pop_ready_task function is not needed.
       // Schedule the tasks that have been waiting on the input 'task',
       // which may have been deleted.
 
       while ( x != end ) {
+        // Have exclusive access to 'x' until it is scheduled
+        // Set x->m_next = zero  <=  no dependence, not a respawn
 
-        // Set x->m_next = zero  <=  no dependence
-
-        task_root_type * const next =
-          (task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
+        task_root_type * const next = x->m_next ; x->m_next = 0 ;
 
-        schedule( x );
+        if ( task_root_type::Aggregate != x->m_task_type ) {
+          schedule_runnable( x );
+        }
+        else {
+          schedule_aggregate( x );
+        }
 
         x = next ;
       }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
index ff503cb27329c006aeb0b476c2dd54e09d43baa4..d72cde03fd2bb1ae40559c80d007f7a8836636c0 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -45,6 +45,7 @@
 #define KOKKOS_CORE_IMPL_UTILITIES_HPP
 
 #include <Kokkos_Macros.hpp>
+#include <stdint.h>
 #include <type_traits>
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
index ad1b6dce39d03182d1187105d79a9cb8e239ac8e..93ff6c48a77d00e45e3028413d5c02f4020d65bc 100644
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,52 +36,144 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <Kokkos_Macros.hpp>
+
 #include <impl/Kokkos_spinwait.hpp>
 
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+
 /*--------------------------------------------------------------------------*/
 
-#if ( KOKKOS_ENABLE_ASM )
-  #if defined( __arm__ ) || defined( __aarch64__ )
-    /* No-operation instruction to idle the thread. */
-    #define YIELD   asm volatile("nop")
+#if !defined( _WIN32 )
+  #if defined( KOKKOS_ENABLE_ASM )
+    #if defined( __arm__ ) || defined( __aarch64__ )
+      /* No pause instruction on ARM; the nop sequences below idle the thread. */
+      #define KOKKOS_INTERNAL_PAUSE
+    #else
+      /* Pause instruction to prevent excess processor bus usage */
+      #define KOKKOS_INTERNAL_PAUSE   asm volatile("pause\n":::"memory")
+    #endif
+    #define KOKKOS_INTERNAL_NOP2    asm volatile("nop\n" "nop\n")
+    #define KOKKOS_INTERNAL_NOP4    KOKKOS_INTERNAL_NOP2;  KOKKOS_INTERNAL_NOP2
+    #define KOKKOS_INTERNAL_NOP8    KOKKOS_INTERNAL_NOP4;  KOKKOS_INTERNAL_NOP4;
+    #define KOKKOS_INTERNAL_NOP16   KOKKOS_INTERNAL_NOP8;  KOKKOS_INTERNAL_NOP8;
+    #define KOKKOS_INTERNAL_NOP32   KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
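+    // Progressive back-off: bit_scan_reverse( (i >> 2) + 1 ) grows with the
+    // spin count 'i', so the nop burst lengthens from 2 to 32 instructions,
+    // keeping a long-waiting thread off the processor bus.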
+    namespace {
+    inline void kokkos_internal_yield( const unsigned i ) noexcept {
+      switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
+      case 0u:  KOKKOS_INTERNAL_NOP2;  break;
+      case 1u:  KOKKOS_INTERNAL_NOP4;  break;
+      case 2u:  KOKKOS_INTERNAL_NOP8;  break;
+      case 3u:  KOKKOS_INTERNAL_NOP16; break;
+      default: KOKKOS_INTERNAL_NOP32;
+      }
+      KOKKOS_INTERNAL_PAUSE;
+    }
+    }
   #else
-    /* Pause instruction to prevent excess processor bus usage */
-    #define YIELD   asm volatile("pause\n":::"memory")
+    #include <sched.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      sched_yield();
+    }
+    }
+  #endif
+#else // defined( _WIN32 )
+  #if defined ( KOKKOS_ENABLE_WINTHREAD )
+    #include <process.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      Sleep(0);
+    }
+    }
+  #elif defined( _MSC_VER )
+    #define NOMINMAX
+    #include <winsock2.h>
+    #include <windows.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      YieldProcessor();
+    }
+    }
+  #else
+    #define KOKKOS_INTERNAL_PAUSE   __asm__ __volatile__("pause\n":::"memory")
+    #define KOKKOS_INTERNAL_NOP2    __asm__ __volatile__("nop\n" "nop")
+    #define KOKKOS_INTERNAL_NOP4    KOKKOS_INTERNAL_NOP2;  KOKKOS_INTERNAL_NOP2
+    #define KOKKOS_INTERNAL_NOP8    KOKKOS_INTERNAL_NOP4;  KOKKOS_INTERNAL_NOP4;
+    #define KOKKOS_INTERNAL_NOP16   KOKKOS_INTERNAL_NOP8;  KOKKOS_INTERNAL_NOP8;
+    #define KOKKOS_INTERNAL_NOP32   KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
+    namespace {
+    inline void kokkos_internal_yield( const unsigned i ) noexcept {
+      switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
+      case 0:  KOKKOS_INTERNAL_NOP2;  break;
+      case 1:  KOKKOS_INTERNAL_NOP4;  break;
+      case 2:  KOKKOS_INTERNAL_NOP8;  break;
+      case 3:  KOKKOS_INTERNAL_NOP16; break;
+      default: KOKKOS_INTERNAL_NOP32;
+      }
+      KOKKOS_INTERNAL_PAUSE;
+    }
+    }
   #endif
-#elif defined ( KOKKOS_ENABLE_WINTHREAD )
-  #include <process.h>
-  #define YIELD  Sleep(0)
-#elif defined ( _WIN32)  && defined (_MSC_VER)
-  /* Windows w/ Visual Studio */
-  #define NOMINMAX
-  #include <winsock2.h>
-  #include <windows.h>
-#define YIELD YieldProcessor();
-#elif defined ( _WIN32 )
-  /* Windows w/ Intel*/
-  #define YIELD __asm__ __volatile__("pause\n":::"memory")
-#else
-  #include <sched.h>
-  #define YIELD  sched_yield()
 #endif
 
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 namespace Impl {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value )
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value == flag ) {
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value != flag ) {
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
 {
+  Kokkos::store_fence();
+  unsigned i = 0;
   while ( value == flag ) {
-    YIELD ;
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value != flag ) {
+    kokkos_internal_yield(i);
+    ++i;
   }
+  Kokkos::load_fence();
 }
+
 #endif
 
 } /* namespace Impl */
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
index cc87771faefcb8ad7716842890dbec4a9c1219a1..6e34b8a943d164eea1af317be66928a26a9e4ab2 100644
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,14 +47,30 @@
 
 #include <Kokkos_Macros.hpp>
 
+#include <cstdint>
+
 namespace Kokkos {
 namespace Impl {
 
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value );
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
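+
+// Typical use (illustrative): block the calling thread until another thread
+// changes a shared flag, e.g.
+//
+//   volatile int32_t flag = 0 ;
+//   /* ... another thread eventually updates flag ... */
+//   spinwait_while_equal( flag , 0 );   // returns once flag != 0
+//   spinwait_until_equal( flag , 2 );   // returns once flag == 2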
 #else
+
+KOKKOS_INLINE_FUNCTION
+void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
+KOKKOS_INLINE_FUNCTION
+void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
+
+KOKKOS_INLINE_FUNCTION
+void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
 KOKKOS_INLINE_FUNCTION
-void spinwait( volatile int & , const int ) {}
+void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
+
 #endif
 
 } /* namespace Impl */
diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt
index 795657fe876233c8ef7f962bdce12be4d0452e2f..caf6c50129f090cd13cd92e67a79880949e821a1 100644
--- a/lib/kokkos/core/unit_test/CMakeLists.txt
+++ b/lib/kokkos/core/unit_test/CMakeLists.txt
@@ -115,10 +115,31 @@ IF(Kokkos_ENABLE_OpenMP)
   )
 ENDIF()
 
-IF(Kokkos_ENABLE_QTHREAD)
+IF(Kokkos_ENABLE_Qthreads)
   TRIBITS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Qthread
-    SOURCES UnitTestMain.cpp TestQthread.cpp
+    UnitTest_Qthreads
+    SOURCES
+      UnitTestMain.cpp
+      qthreads/TestQthreads_Atomics.cpp
+      qthreads/TestQthreads_Other.cpp
+      qthreads/TestQthreads_Reductions.cpp
+      qthreads/TestQthreads_SubView_a.cpp
+      qthreads/TestQthreads_SubView_b.cpp
+      qthreads/TestQthreads_SubView_c01.cpp
+      qthreads/TestQthreads_SubView_c02.cpp
+      qthreads/TestQthreads_SubView_c03.cpp
+      qthreads/TestQthreads_SubView_c04.cpp
+      qthreads/TestQthreads_SubView_c05.cpp
+      qthreads/TestQthreads_SubView_c06.cpp
+      qthreads/TestQthreads_SubView_c07.cpp
+      qthreads/TestQthreads_SubView_c08.cpp
+      qthreads/TestQthreads_SubView_c09.cpp
+      qthreads/TestQthreads_SubView_c10.cpp
+      qthreads/TestQthreads_SubView_c11.cpp
+      qthreads/TestQthreads_SubView_c12.cpp
+      qthreads/TestQthreads_Team.cpp
+      qthreads/TestQthreads_ViewAPI_a.cpp
+      qthreads/TestQthreads_ViewAPI_b.cpp
     COMM serial mpi
     NUM_MPI_PROCS 1
     FAIL_REGULAR_EXPRESSION "  FAILED  "
@@ -194,4 +215,3 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   FAIL_REGULAR_EXPRESSION "  FAILED  "
     TESTONLYLIBS kokkos_gtest
 )
-
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
index cc59825fba85d17b67c0694de1198acd240587d9..d93830a28d9db5ae50306c70ae5187062a07c594 100644
--- a/lib/kokkos/core/unit_test/Makefile
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -6,6 +6,7 @@ vpath %.cpp ${KOKKOS_PATH}/core/unit_test
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
 
 TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
@@ -35,15 +36,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	OBJ_CUDA = TestCuda_Other.o TestCuda_Reductions_a.o TestCuda_Reductions_b.o TestCuda_Atomics.o TestCuda_Team.o TestCuda_Spaces.o
 	OBJ_CUDA += TestCuda_SubView_a.o TestCuda_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestCuda_SubView_c_all.o
+	OBJ_OPENMP += TestCuda_SubView_c_all.o
 else
 	OBJ_CUDA += TestCuda_SubView_c01.o TestCuda_SubView_c02.o TestCuda_SubView_c03.o
-	OBJ_CUDA += TestCuda_SubView_c04.o  TestCuda_SubView_c05.o  TestCuda_SubView_c06.o
-	OBJ_CUDA += TestCuda_SubView_c07.o  TestCuda_SubView_c08.o  TestCuda_SubView_c09.o
+	OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o
+	OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
 	OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
 endif
-	OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o  TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o
-	OBJ_CUDA += TestCuda_ViewAPI_e.o TestCuda_ViewAPI_f.o  TestCuda_ViewAPI_g.o TestCuda_ViewAPI_h.o
+	OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o
+	OBJ_CUDA += TestCuda_ViewAPI_e.o TestCuda_ViewAPI_f.o TestCuda_ViewAPI_g.o TestCuda_ViewAPI_h.o
 	OBJ_CUDA += TestCuda_ViewAPI_s.o
 	OBJ_CUDA += UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Cuda
@@ -51,13 +52,13 @@ endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	OBJ_THREADS = TestThreads_Other.o TestThreads_Reductions.o TestThreads_Atomics.o TestThreads_Team.o 
-	OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o 
+	OBJ_THREADS = TestThreads_Other.o TestThreads_Reductions.o TestThreads_Atomics.o TestThreads_Team.o
+	OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o
 	OBJ_THREADS += TestThreads_SubView_c01.o TestThreads_SubView_c02.o TestThreads_SubView_c03.o
-	OBJ_THREADS += TestThreads_SubView_c04.o  TestThreads_SubView_c05.o  TestThreads_SubView_c06.o  
-	OBJ_THREADS += TestThreads_SubView_c07.o  TestThreads_SubView_c08.o  TestThreads_SubView_c09.o
+	OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o
+	OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o
 	OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o
-	OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o UnitTestMain.o gtest-all.o 
+	OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Threads
 	TEST_TARGETS += test-threads
 endif
@@ -66,11 +67,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 	OBJ_OPENMP = TestOpenMP_Other.o TestOpenMP_Reductions.o TestOpenMP_Atomics.o TestOpenMP_Team.o
 	OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestOpenMP_SubView_c_all.o
+	OBJ_OPENMP += TestOpenMP_SubView_c_all.o
 else
 	OBJ_OPENMP += TestOpenMP_SubView_c01.o TestOpenMP_SubView_c02.o TestOpenMP_SubView_c03.o
-	OBJ_OPENMP += TestOpenMP_SubView_c04.o  TestOpenMP_SubView_c05.o  TestOpenMP_SubView_c06.o
-	OBJ_OPENMP += TestOpenMP_SubView_c07.o  TestOpenMP_SubView_c08.o  TestOpenMP_SubView_c09.o
+	OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o
+	OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
 	OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
 endif
 	OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o UnitTestMain.o gtest-all.o
@@ -78,28 +79,38 @@ endif
 	TEST_TARGETS += test-openmp
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+	OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
+	OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
+ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+	OBJ_QTHREADS += TestQthreads_SubView_c_all.o
+else
+	OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
+	OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
+	OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
+	OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
+endif
+	OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Qthreads
+	TEST_TARGETS += test-qthreads
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-	OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o 
-	OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o 
+	OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o
+	OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestSerial_SubView_c_all.o
+	OBJ_OPENMP += TestSerial_SubView_c_all.o
 else
 	OBJ_SERIAL += TestSerial_SubView_c01.o TestSerial_SubView_c02.o TestSerial_SubView_c03.o
-	OBJ_SERIAL += TestSerial_SubView_c04.o  TestSerial_SubView_c05.o  TestSerial_SubView_c06.o  
-	OBJ_SERIAL += TestSerial_SubView_c07.o  TestSerial_SubView_c08.o  TestSerial_SubView_c09.o
+	OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o
+	OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
 	OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
 endif
-	OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o UnitTestMain.o gtest-all.o 
+	OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Serial
 	TEST_TARGETS += test-serial
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
-	TARGETS += KokkosCore_UnitTest_Qthread
-	TEST_TARGETS += test-qthread
-endif
-
 OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_HWLOC
 TEST_TARGETS += test-hwloc
@@ -115,10 +126,6 @@ TARGETS += ${INITTESTS_TARGETS}
 INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
 TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
 
-OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
-TARGETS += KokkosCore_UnitTest_Synchronic
-TEST_TARGETS += test-synchronic
-
 KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
 
@@ -131,8 +138,8 @@ KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
 
-KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
+KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthreads
 
 KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
@@ -146,9 +153,6 @@ KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
 ${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
-KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
-
 test-cuda: KokkosCore_UnitTest_Cuda
 	./KokkosCore_UnitTest_Cuda
 
@@ -161,8 +165,8 @@ test-openmp: KokkosCore_UnitTest_OpenMP
 test-serial: KokkosCore_UnitTest_Serial
 	./KokkosCore_UnitTest_Serial
 
-test-qthread: KokkosCore_UnitTest_Qthread
-	./KokkosCore_UnitTest_Qthread
+test-qthreads: KokkosCore_UnitTest_Qthreads
+	./KokkosCore_UnitTest_Qthreads
 
 test-hwloc: KokkosCore_UnitTest_HWLOC
 	./KokkosCore_UnitTest_HWLOC
@@ -176,9 +180,6 @@ test-default: KokkosCore_UnitTest_Default
 ${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
 	./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
-test-synchronic: KokkosCore_UnitTest_Synchronic
-	./KokkosCore_UnitTest_Synchronic
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
@@ -193,4 +194,3 @@ clean: kokkos-clean
 
 gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
-
diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp
index d22837f3ed7b67bccecfbe11ba4d71266a094616..f09cc5018cb698ec033639a326a29d8fffacec3f 100644
--- a/lib/kokkos/core/unit_test/TestAggregate.hpp
+++ b/lib/kokkos/core/unit_test/TestAggregate.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,8 +50,6 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 #include <impl/Kokkos_ViewArray.hpp>
 
 namespace Test {
@@ -59,51 +57,68 @@ namespace Test {
 template< class DeviceType >
 void TestViewAggregate()
 {
-  typedef Kokkos::Array<double,32>  value_type ;
-
-  typedef Kokkos::Experimental::Impl::
-    ViewDataAnalysis< value_type * , Kokkos::LayoutLeft , value_type >
-      analysis_1d ;
+  typedef Kokkos::Array< double, 32 >  value_type;
+  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d;
 
-  static_assert( std::is_same< typename analysis_1d::specialize , Kokkos::Array<> >::value , "" );
+  static_assert( std::is_same< typename analysis_1d::specialize, Kokkos::Array<> >::value, "" );
 
+  typedef Kokkos::ViewTraits< value_type **, DeviceType > a32_traits;
+  typedef Kokkos::ViewTraits< typename a32_traits::scalar_array_type, DeviceType > flat_traits;
 
-  typedef Kokkos::ViewTraits< value_type ** , DeviceType > a32_traits ;
-  typedef Kokkos::ViewTraits< typename a32_traits::scalar_array_type , DeviceType > flat_traits ;
+  static_assert( std::is_same< typename a32_traits::specialize, Kokkos::Array<> >::value, "" );
+  static_assert( std::is_same< typename a32_traits::value_type, value_type >::value, "" );
+  static_assert( a32_traits::rank == 2, "" );
+  static_assert( a32_traits::rank_dynamic == 2, "" );
 
-  static_assert( std::is_same< typename a32_traits::specialize , Kokkos::Array<> >::value , "" );
-  static_assert( std::is_same< typename a32_traits::value_type , value_type >::value , "" );
-  static_assert( a32_traits::rank == 2 , "" );
-  static_assert( a32_traits::rank_dynamic == 2 , "" );
+  static_assert( std::is_same< typename flat_traits::specialize, void >::value, "" );
+  static_assert( flat_traits::rank == 3, "" );
+  static_assert( flat_traits::rank_dynamic == 2, "" );
+  static_assert( flat_traits::dimension::N2 == 32, "" );
 
-  static_assert( std::is_same< typename flat_traits::specialize , void >::value , "" );
-  static_assert( flat_traits::rank == 3 , "" );
-  static_assert( flat_traits::rank_dynamic == 2 , "" );
-  static_assert( flat_traits::dimension::N2 == 32 , "" );
+  typedef Kokkos::View< Kokkos::Array< double, 32 > **, DeviceType > a32_type;
+  typedef typename a32_type::array_type  a32_flat_type;
 
+  static_assert( std::is_same< typename a32_type::value_type, value_type >::value, "" );
+  static_assert( std::is_same< typename a32_type::pointer_type, double * >::value, "" );
+  static_assert( a32_type::Rank == 2, "" );
+  static_assert( a32_flat_type::Rank == 3, "" );
 
-  typedef Kokkos::View< Kokkos::Array<double,32> ** , DeviceType > a32_type ;
-
-  typedef typename a32_type::array_type  a32_flat_type ;
-
-  static_assert( std::is_same< typename a32_type::value_type , value_type >::value , "" );
-  static_assert( std::is_same< typename a32_type::pointer_type , double * >::value , "" );
-  static_assert( a32_type::Rank == 2 , "" );
-  static_assert( a32_flat_type::Rank == 3 , "" );
-
-  a32_type x("test",4,5);
+  a32_type x( "test", 4, 5 );
   a32_flat_type y( x );
 
-  ASSERT_EQ( x.extent(0) , 4 );
-  ASSERT_EQ( x.extent(1) , 5 );
-  ASSERT_EQ( y.extent(0) , 4 );
-  ASSERT_EQ( y.extent(1) , 5 );
-  ASSERT_EQ( y.extent(2) , 32 );
-}
-
+  ASSERT_EQ( x.extent( 0 ), 4 );
+  ASSERT_EQ( x.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 0 ), 4 );
+  ASSERT_EQ( y.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 2 ), 32 );
+
+  // Initialize arrays from brace-init-list as for std::array.
+  //
+  // Comment: Clang will issue the following warning if we don't use double
+  //          braces here (one for initializing the Kokkos::Array and one for
+  //          initializing the sub-aggregate C-array data member),
+  //
+  //            warning: suggest braces around initialization of subobject
+  //
+  //          but single brace syntax would be valid as well.
+  Kokkos::Array< float, 2 > aggregate_initialization_syntax_1 = { { 1.41, 3.14 } };
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[0], 1.41 );
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[1], 3.14 );
+
+  Kokkos::Array< int, 3 > aggregate_initialization_syntax_2{ { 0, 1, 2 } }; // since C++11
+  for ( int i = 0; i < 3; ++i ) {
+    ASSERT_EQ( aggregate_initialization_syntax_2[i], i );
+  }
+
+  // Note that this is a valid initialization.
+  Kokkos::Array< double, 3 > initialized_with_one_argument_missing = { { 255, 255 } };
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_DOUBLE_EQ( initialized_with_one_argument_missing[i], 255 );
+  }
+  // But the following line would not compile
+//  Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } };
 }
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
+} // namespace Test
 
 #endif /* #ifndef TEST_AGGREGATE_HPP */
diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp
index e948723574b48b2a64ee66c487062e34c0ccf29b..ff77b8dca6f0437393bacca9d42ed73d359e44d5 100644
--- a/lib/kokkos/core/unit_test/TestAtomic.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomic.hpp
@@ -45,116 +45,129 @@
 
 namespace TestAtomic {
 
-// Struct for testing arbitrary size atomics
+// Struct for testing arbitrary size atomics.
 
-template<int N>
+template< int N >
 struct SuperScalar {
   double val[N];
 
   KOKKOS_INLINE_FUNCTION
   SuperScalar() {
-    for(int i=0; i<N; i++)
+    for ( int i = 0; i < N; i++ ) {
       val[i] = 0.0;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const volatile SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator = (const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator = (const volatile SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator=( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator = (const SuperScalar& src) volatile  {
-    for(int i=0; i<N; i++)
+  void operator=( const SuperScalar & src ) volatile {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar operator + (const SuperScalar& src) {
+  SuperScalar operator+( const SuperScalar & src ) {
     SuperScalar tmp = *this;
-    for(int i=0; i<N; i++)
+    for ( int i = 0; i < N; i++ ) {
       tmp.val[i] += src.val[i];
+    }
     return tmp;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator += (const double& src) {
-    for(int i=0; i<N; i++)
-      val[i] += 1.0*(i+1)*src;
+  SuperScalar& operator+=( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] += 1.0 * ( i + 1 ) * src;
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator += (const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator+=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] += src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const SuperScalar& src) {
+  bool operator==( const SuperScalar & src ) {
     bool compare = true;
-    for(int i=0; i<N; i++)
-      compare = compare && ( val[i] == src.val[i]);
+    for ( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
     return compare;
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const SuperScalar& src) {
+  bool operator!=( const SuperScalar & src ) {
     bool compare = true;
-    for(int i=0; i<N; i++)
-      compare = compare && ( val[i] == src.val[i]);
+    for ( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
     return !compare;
   }
 
-
-
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const double& src) {
-    for(int i=0; i<N; i++)
-      val[i] = 1.0 * (i+1) * src;
+  SuperScalar( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = 1.0 * ( i + 1 ) * src;
+    }
   }
-
 };
 
-template<int N>
-std::ostream& operator<<(std::ostream& os, const SuperScalar<N>& dt)
+template< int N >
+std::ostream & operator<<( std::ostream & os, const SuperScalar< N > & dt )
 {
-    os << "{ ";
-    for(int i=0;i<N-1;i++)
-       os << dt.val[i] << ", ";
-    os << dt.val[N-1] << "}";
-    return os;
+  os << "{ ";
+  for ( int i = 0; i < N - 1; i++ ) {
+    os << dt.val[i] << ", ";
+  }
+  os << dt.val[N-1] << "}";
+
+  return os;
 }
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct ZeroFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = 0;
   }
 };
@@ -163,47 +176,53 @@ struct ZeroFunctor {
 //--------------atomic_fetch_add---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct AddFunctor{
+template< class T, class DEVICE_TYPE >
+struct AddFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_add(&data(),(T)1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_add( &data(), (T) 1 );
   }
 };
 
-template<class T, class execution_space >
-T AddLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T AddLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct AddFunctor<T,execution_space> f_add;
+  struct AddFunctor< T, execution_space > f_add;
+
   f_add.data = data;
-  Kokkos::parallel_for(loop,f_add);
+  Kokkos::parallel_for( loop, f_add );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T AddLoopSerial(int loop) {
+template< class T >
+T AddLoopSerial( int loop ) {
   T* data = new T[1];
   data[0] = 0;
 
-  for(int i=0;i<loop;i++)
-  *data+=(T)1;
+  for ( int i = 0; i < loop; i++ ) {
+    *data += (T) 1;
+  }
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
@@ -211,65 +230,69 @@ T AddLoopSerial(int loop) {
 //--------------atomic_compare_exchange-----------------
 //------------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct CASFunctor{
+template< class T, class DEVICE_TYPE >
+struct CASFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-	  T old = data();
-	  T newval, assumed;
-	  do {
-	    assumed = old;
-	    newval = assumed + (T)1;
-	    old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
-	  }
-	  while( old != assumed );
+  void operator()( int ) const {
+    T old = data();
+    T newval, assumed;
+
+    do {
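+    // Classic CAS retry loop: repeat until the exchange observes the value
+    // we assumed, i.e. no other thread modified data() in between.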
+      assumed = old;
+      newval = assumed + (T) 1;
+      old = Kokkos::atomic_compare_exchange( &data(), assumed, newval );
+    } while( old != assumed );
   }
 };
 
-template<class T, class execution_space >
-T CASLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T CASLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct CASFunctor<T,execution_space> f_cas;
+  struct CASFunctor< T, execution_space > f_cas;
+
   f_cas.data = data;
-  Kokkos::parallel_for(loop,f_cas);
+  Kokkos::parallel_for( loop, f_cas );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
 
   return val;
 }
 
-template<class T>
-T CASLoopSerial(int loop) {
+template< class T >
+T CASLoopSerial( int loop ) {
   T* data = new T[1];
   data[0] = 0;
 
-  for(int i=0;i<loop;i++) {
-	  T assumed;
-	  T newval;
-	  T old;
-	  do {
-	    assumed = *data;
-	    newval = assumed + (T)1;
-	    old = *data;
-	    *data = newval;
-	  }
-	  while(!(assumed==old));
+  for ( int i = 0; i < loop; i++ ) {
+    T assumed;
+    T newval;
+    T old;
+
+    do {
+      assumed = *data;
+      newval = assumed + (T) 1;
+      old = *data;
+      *data = newval;
+    } while( !( assumed == old ) );
   }
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
@@ -277,109 +300,119 @@ T CASLoopSerial(int loop) {
 //--------------atomic_exchange-----------------
 //----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct ExchFunctor{
+template< class T, class DEVICE_TYPE >
+struct ExchFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data, data2;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int i) const {
-    T old = Kokkos::atomic_exchange(&data(),(T)i);
-    Kokkos::atomic_fetch_add(&data2(),old);
+  void operator()( int i ) const {
+    T old = Kokkos::atomic_exchange( &data(), (T) i );
+    Kokkos::atomic_fetch_add( &data2(), old );
   }
 };
 
-template<class T, class execution_space >
-T ExchLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T ExchLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  typename ZeroFunctor<T,execution_space>::type data2("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data2("HData");
+  typename ZeroFunctor< T, execution_space >::type data2( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data2( "HData" );
+
   f_zero.data = data2;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct ExchFunctor<T,execution_space> f_exch;
+  struct ExchFunctor< T, execution_space > f_exch;
+
   f_exch.data = data;
   f_exch.data2 = data2;
-  Kokkos::parallel_for(loop,f_exch);
+  Kokkos::parallel_for( loop, f_exch );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
-  Kokkos::deep_copy(h_data2,data2);
+  Kokkos::deep_copy( h_data, data );
+  Kokkos::deep_copy( h_data2, data2 );
   T val = h_data() + h_data2();
 
   return val;
 }
 
-template<class T>
-T ExchLoopSerial(typename std::conditional<!std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+template< class T >
+T ExchLoopSerial( typename std::conditional< !std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
   T* data = new T[1];
   T* data2 = new T[1];
   data[0] = 0;
   data2[0] = 0;
-  for(int i=0;i<loop;i++) {
-	T old = *data;
-	*data=(T) i;
-	*data2+=old;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    *data = (T) i;
+    *data2 += old;
   }
 
   T val = *data2 + *data;
   delete [] data;
   delete [] data2;
+
   return val;
 }
 
-template<class T>
-T ExchLoopSerial(typename std::conditional<std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+template< class T >
+T ExchLoopSerial( typename std::conditional< std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
   T* data = new T[1];
   T* data2 = new T[1];
   data[0] = 0;
   data2[0] = 0;
-  for(int i=0;i<loop;i++) {
-  T old = *data;
-  data->real() = (static_cast<double>(i));
-  data->imag() = 0;
-  *data2+=old;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    data->real() = ( static_cast<double>( i ) );
+    data->imag() = 0;
+    *data2 += old;
   }
 
   T val = *data2 + *data;
   delete [] data;
   delete [] data2;
+
   return val;
 }
 
-template<class T, class DeviceType >
-T LoopVariant(int loop, int test) {
-  switch (test) {
-    case 1: return AddLoop<T,DeviceType>(loop);
-    case 2: return CASLoop<T,DeviceType>(loop);
-    case 3: return ExchLoop<T,DeviceType>(loop);
+template< class T, class DeviceType >
+T LoopVariant( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoop< T, DeviceType >( loop );
+    case 2: return CASLoop< T, DeviceType >( loop );
+    case 3: return ExchLoop< T, DeviceType >( loop );
   }
+
   return 0;
 }
 
-template<class T>
-T LoopVariantSerial(int loop, int test) {
-  switch (test) {
-    case 1: return AddLoopSerial<T>(loop);
-    case 2: return CASLoopSerial<T>(loop);
-    case 3: return ExchLoopSerial<T>(loop);
+template< class T >
+T LoopVariantSerial( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoopSerial< T >( loop );
+    case 2: return CASLoopSerial< T >( loop );
+    case 3: return ExchLoopSerial< T >( loop );
   }
+
   return 0;
 }
 
-template<class T,class DeviceType>
-bool Loop(int loop, int test)
+template< class T, class DeviceType >
+bool Loop( int loop, int test )
 {
-  T res       = LoopVariant<T,DeviceType>(loop,test);
-  T resSerial = LoopVariantSerial<T>(loop,test);
+  T res       = LoopVariant< T, DeviceType >( loop, test );
+  T resSerial = LoopVariantSerial< T >( loop, test );
 
   bool passed = true;
 
@@ -387,16 +420,14 @@ bool Loop(int loop, int test)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = "
               << test << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-
-  return passed ;
-}
-
+  return passed;
 }
 
+} // namespace TestAtomic
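
For reference, a minimal sketch of the compare-and-swap increment pattern that CASFunctor above exercises (assuming Kokkos has been initialized and a default execution space is available; the view name, trip count, and helper function are illustrative only, not part of the patch):

#include <Kokkos_Core.hpp>

void cas_increment_example() {
  Kokkos::View< int > count( "count" );   // scalar device view, zero-initialized

  Kokkos::parallel_for( 1000, KOKKOS_LAMBDA( int ) {
    int old = count();
    int assumed, newval;

    do {
      assumed = old;
      newval  = assumed + 1;
      // atomic_compare_exchange returns the value found at the address;
      // the loop retries until no other thread changed it in between.
      old = Kokkos::atomic_compare_exchange( &count(), assumed, newval );
    } while ( old != assumed );
  });

  Kokkos::fence();   // on completion count() holds 1000 on the device
}
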
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
index 7f1519045187c535c586659e757eeb24609ccb50..e3ceca404ff12c1c9e5da04bf70d183fee87dfdd 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -49,14 +49,16 @@ namespace TestAtomicOperations {
 //--------------zero_functor---------------------
 //-----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct ZeroFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = 0;
   }
 };
@@ -65,78 +67,84 @@ struct ZeroFunctor {
 //--------------init_functor---------------------
 //-----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct InitFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
-  T init_value ;
+  T init_value;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = init_value;
   }
 
-  InitFunctor(T _init_value) : init_value(_init_value) {}
+  InitFunctor( T _init_value ) : init_value( _init_value ) {}
 };
 
-
 //---------------------------------------------------
 //--------------atomic_fetch_max---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MaxFunctor{
+template< class T, class DEVICE_TYPE >
+struct MaxFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    //Kokkos::atomic_fetch_max(&data(),(T)1);
-    Kokkos::atomic_fetch_max(&data(),(T)i1);
+  void operator()( int ) const {
+    //Kokkos::atomic_fetch_max( &data(), (T) 1 );
+    Kokkos::atomic_fetch_max( &data(), (T) i1 );
   }
-  MaxFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+  MaxFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MaxAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MaxAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MaxFunctor<T,execution_space> f(i0,i1);
+  struct MaxFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MaxAtomicCheck(T i0 , T i1) {
+template< class T >
+T MaxAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = (i0 > i1 ? i0 : i1) ;
+  *data = ( i0 > i1 ? i0 : i1 );
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MaxAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MaxAtomicTest( T i0, T i1 )
 {
-  T res       = MaxAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MaxAtomicCheck<T>(i0,i1);
+  T res       = MaxAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MaxAtomicCheck<T>( i0, i1 );
 
   bool passed = true;
 
@@ -144,71 +152,77 @@ bool MaxAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MaxAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_min---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MinFunctor{
+template< class T, class DEVICE_TYPE >
+struct MinFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_min(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_min( &data(), (T) i1 );
   }
-  MinFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  MinFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MinAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MinAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MinFunctor<T,execution_space> f(i0,i1);
+  struct MinFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MinAtomicCheck(T i0 , T i1) {
+template< class T >
+T MinAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = (i0 < i1 ? i0 : i1) ;
+  *data = ( i0 < i1 ? i0 : i1 );
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MinAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MinAtomicTest( T i0, T i1 )
 {
-  T res       = MinAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MinAtomicCheck<T>(i0,i1);
+  T res       = MinAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MinAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -216,55 +230,60 @@ bool MinAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MinAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_increment---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct IncFunctor{
+template< class T, class DEVICE_TYPE >
+struct IncFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_increment(&data());
+  void operator()( int ) const {
+    Kokkos::atomic_increment( &data() );
   }
-  IncFunctor( T _i0 ) : i0(_i0) {}
+
+  IncFunctor( T _i0 ) : i0( _i0 ) {}
 };
 
-template<class T, class execution_space >
-T IncAtomic(T i0) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T IncAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct IncFunctor<T,execution_space> f(i0);
+  struct IncFunctor< T, execution_space > f( i0 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T IncAtomicCheck(T i0) {
+template< class T >
+T IncAtomicCheck( T i0 ) {
   T* data = new T[1];
   data[0] = 0;
 
@@ -272,14 +291,15 @@ T IncAtomicCheck(T i0) {
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool IncAtomicTest(T i0)
+template< class T, class DeviceType >
+bool IncAtomicTest( T i0 )
 {
-  T res       = IncAtomic<T,DeviceType>(i0);
-  T resSerial = IncAtomicCheck<T>(i0);
+  T res       = IncAtomic< T, DeviceType >( i0 );
+  T resSerial = IncAtomicCheck< T >( i0 );
 
   bool passed = true;
 
@@ -287,55 +307,60 @@ bool IncAtomicTest(T i0)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = IncAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_decrement---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct DecFunctor{
+template< class T, class DEVICE_TYPE >
+struct DecFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_decrement(&data());
+  void operator()( int ) const {
+    Kokkos::atomic_decrement( &data() );
   }
-  DecFunctor( T _i0 ) : i0(_i0) {}
+
+  DecFunctor( T _i0 ) : i0( _i0 ) {}
 };
 
-template<class T, class execution_space >
-T DecAtomic(T i0) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T DecAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct DecFunctor<T,execution_space> f(i0);
+  struct DecFunctor< T, execution_space > f( i0 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T DecAtomicCheck(T i0) {
+template< class T >
+T DecAtomicCheck( T i0 ) {
   T* data = new T[1];
   data[0] = 0;
 
@@ -343,14 +368,15 @@ T DecAtomicCheck(T i0) {
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool DecAtomicTest(T i0)
+template< class T, class DeviceType >
+bool DecAtomicTest( T i0 )
 {
-  T res       = DecAtomic<T,DeviceType>(i0);
-  T resSerial = DecAtomicCheck<T>(i0);
+  T res       = DecAtomic< T, DeviceType >( i0 );
+  T resSerial = DecAtomicCheck< T >( i0 );
 
   bool passed = true;
 
@@ -358,71 +384,77 @@ bool DecAtomicTest(T i0)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DecAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_mul---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MulFunctor{
+template< class T, class DEVICE_TYPE >
+struct MulFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_mul(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mul( &data(), (T) i1 );
   }
-  MulFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  MulFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MulAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MulAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MulFunctor<T,execution_space> f(i0,i1);
+  struct MulFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MulAtomicCheck(T i0 , T i1) {
+template< class T >
+T MulAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0*i1 ;
+  *data = i0*i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MulAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MulAtomicTest( T i0, T i1 )
 {
-  T res       = MulAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MulAtomicCheck<T>(i0,i1);
+  T res       = MulAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MulAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -430,71 +462,77 @@ bool MulAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MulAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_div---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct DivFunctor{
+template< class T, class DEVICE_TYPE >
+struct DivFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_div(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_div( &data(), (T) i1 );
   }
-  DivFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  DivFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T DivAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T DivAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct DivFunctor<T,execution_space> f(i0,i1);
+  struct DivFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T DivAtomicCheck(T i0 , T i1) {
+template< class T >
+T DivAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0/i1 ;
+  *data = i0 / i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool DivAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool DivAtomicTest( T i0, T i1 )
 {
-  T res       = DivAtomic<T,DeviceType>(i0,i1);
-  T resSerial = DivAtomicCheck<T>(i0,i1);
+  T res       = DivAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = DivAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -502,71 +540,77 @@ bool DivAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DivAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_mod---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct ModFunctor{
+template< class T, class DEVICE_TYPE >
+struct ModFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_mod(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mod( &data(), (T) i1 );
   }
-  ModFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  ModFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T ModAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T ModAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct ModFunctor<T,execution_space> f(i0,i1);
+  struct ModFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T ModAtomicCheck(T i0 , T i1) {
+template< class T >
+T ModAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0%i1 ;
+  *data = i0 % i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool ModAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool ModAtomicTest( T i0, T i1 )
 {
-  T res       = ModAtomic<T,DeviceType>(i0,i1);
-  T resSerial = ModAtomicCheck<T>(i0,i1);
+  T res       = ModAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = ModAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -574,71 +618,77 @@ bool ModAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = ModAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_and---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct AndFunctor{
+template< class T, class DEVICE_TYPE >
+struct AndFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_and(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_and( &data(), (T) i1 );
   }
-  AndFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  AndFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T AndAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T AndAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct AndFunctor<T,execution_space> f(i0,i1);
+  struct AndFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T AndAtomicCheck(T i0 , T i1) {
+template< class T >
+T AndAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0&i1 ;
+  *data = i0 & i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool AndAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool AndAtomicTest( T i0, T i1 )
 {
-  T res       = AndAtomic<T,DeviceType>(i0,i1);
-  T resSerial = AndAtomicCheck<T>(i0,i1);
+  T res       = AndAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = AndAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -646,71 +696,77 @@ bool AndAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = AndAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_or----------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct OrFunctor{
+template< class T, class DEVICE_TYPE >
+struct OrFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_or(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_or( &data(), (T) i1 );
   }
-  OrFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  OrFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T OrAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T OrAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct OrFunctor<T,execution_space> f(i0,i1);
+  struct OrFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T OrAtomicCheck(T i0 , T i1) {
+template< class T >
+T OrAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0|i1 ;
+  *data = i0 | i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool OrAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool OrAtomicTest( T i0, T i1 )
 {
-  T res       = OrAtomic<T,DeviceType>(i0,i1);
-  T resSerial = OrAtomicCheck<T>(i0,i1);
+  T res       = OrAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = OrAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -718,71 +774,77 @@ bool OrAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = OrAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_xor---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct XorFunctor{
+template< class T, class DEVICE_TYPE >
+struct XorFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_xor(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_xor( &data(), (T) i1 );
   }
-  XorFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  XorFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T XorAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T XorAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct XorFunctor<T,execution_space> f(i0,i1);
+  struct XorFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T XorAtomicCheck(T i0 , T i1) {
+template< class T >
+T XorAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0^i1 ;
+  *data = i0 ^ i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool XorAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool XorAtomicTest( T i0, T i1 )
 {
-  T res       = XorAtomic<T,DeviceType>(i0,i1);
-  T resSerial = XorAtomicCheck<T>(i0,i1);
+  T res       = XorAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = XorAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -790,71 +852,77 @@ bool XorAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = XorAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_lshift---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct LShiftFunctor{
+template< class T, class DEVICE_TYPE >
+struct LShiftFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_lshift(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_lshift( &data(), (T) i1 );
   }
-  LShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  LShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T LShiftAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T LShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct LShiftFunctor<T,execution_space> f(i0,i1);
+  struct LShiftFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T LShiftAtomicCheck(T i0 , T i1) {
+template< class T >
+T LShiftAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0<<i1 ;
+  *data = i0 << i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool LShiftAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool LShiftAtomicTest( T i0, T i1 )
 {
-  T res       = LShiftAtomic<T,DeviceType>(i0,i1);
-  T resSerial = LShiftAtomicCheck<T>(i0,i1);
+  T res       = LShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = LShiftAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -862,71 +930,77 @@ bool LShiftAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = LShiftAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_rshift---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct RShiftFunctor{
+template< class T, class DEVICE_TYPE >
+struct RShiftFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_rshift(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_rshift( &data(), (T) i1 );
   }
-  RShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  RShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T RShiftAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T RShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct RShiftFunctor<T,execution_space> f(i0,i1);
+  struct RShiftFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T RShiftAtomicCheck(T i0 , T i1) {
+template< class T >
+T RShiftAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0>>i1 ;
+  *data = i0 >> i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool RShiftAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool RShiftAtomicTest( T i0, T i1 )
 {
-  T res       = RShiftAtomic<T,DeviceType>(i0,i1);
-  T resSerial = RShiftAtomicCheck<T>(i0,i1);
+  T res       = RShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = RShiftAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -934,52 +1008,52 @@ bool RShiftAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RShiftAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //--------------atomic_test_control------------------
 //---------------------------------------------------
 
-template<class T,class DeviceType>
-bool AtomicOperationsTestIntegralType( int i0 , int i1 , int test )
+template< class T, class DeviceType >
+bool AtomicOperationsTestIntegralType( int i0, int i1, int test )
 {
-  switch (test) {
-    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 5: return ModAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 6: return AndAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 7: return OrAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 8: return XorAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 9: return LShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 10: return RShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 11: return IncAtomicTest<T,DeviceType>( (T)i0 );
-    case 12: return DecAtomicTest<T,DeviceType>( (T)i0 );
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 5: return ModAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 6: return AndAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 7: return OrAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 8: return XorAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 9: return LShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 10: return RShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 11: return IncAtomicTest< T, DeviceType >( (T) i0 );
+    case 12: return DecAtomicTest< T, DeviceType >( (T) i0 );
   }
+
   return 0;
 }
 
-template<class T,class DeviceType>
-bool AtomicOperationsTestNonIntegralType( int i0 , int i1 , int test )
+template< class T, class DeviceType >
+bool AtomicOperationsTestNonIntegralType( int i0, int i1, int test )
 {
-  switch (test) {
-    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
   }
+
   return 0;
 }
 
-} // namespace
-
+} // namespace TestAtomicOperations
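
The fetch-style operations covered by the Max/Min/Mul/Div/Mod/And/Or/Xor/shift functors above all share the same calling convention; a minimal sketch (again assuming Kokkos is initialized; the view name and loop bound are illustrative):

#include <Kokkos_Core.hpp>

void fetch_ops_example() {
  Kokkos::View< int > v( "v" );

  Kokkos::parallel_for( 100, KOKKOS_LAMBDA( int i ) {
    // Each call applies its operation atomically and returns the old value.
    Kokkos::atomic_fetch_add( &v(), 1 );   // v += 1
    Kokkos::atomic_fetch_max( &v(), i );   // v = max( v, i )
    Kokkos::atomic_fetch_and( &v(), i );   // v &= i
    Kokkos::atomic_increment( &v() );      // ++v
  });

  Kokkos::fence();
}
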
diff --git a/lib/kokkos/core/unit_test/TestAtomicViews.hpp b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
index 739492d32f806a80d1b64f10e3d0ba887f627acd..71080e5c8216aecd01985139c37bb68931139929 100644
--- a/lib/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -49,56 +49,52 @@ namespace TestAtomicViews {
 //-----------atomic view api tests-----------------
 //-------------------------------------------------
 
-template< class T , class ... P >
-size_t allocation_count( const Kokkos::View<T,P...> & view )
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
 {
   const size_t card  = view.size();
   const size_t alloc = view.span();
 
-  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
 
-  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
 }
 
-template< class DataType ,
-          class DeviceType ,
+template< class DataType,
+          class DeviceType,
           unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
-struct TestViewOperator_LeftAndRight ;
+struct TestViewOperator_LeftAndRight;
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+    { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
+    { update = 0; }
 
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > left_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic > > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > right_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic > > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic >> stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -111,357 +107,338 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
-      // below checks that values match, but unable to check the references 
-      // - should this be able to be checked?
-      if ( left(i0)  != left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( right(i0) != right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( left(i0)  != left_stride(i0) ) { update |= 4 ; }
-      if ( right(i0) != right_stride(i0) ) { update |= 8 ; }
-      /*
-      if ( & left(i0)  != & left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0) != & right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
-      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
-      */
+      // Below checks that values match, but unable to check the references.
+      // Should this be able to be checked?
+      if ( left( i0 )  != left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( right( i0 ) != right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( left( i0 )  != left_stride( i0 ) ) { update |= 4; }
+      if ( right( i0 ) != right_stride( i0 ) ) { update |= 8; }
+/*
+      if ( &left( i0 )  != &left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( &right( i0 ) != &right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( &left( i0 )  != &left_stride( i0 ) ) { update |= 4; }
+      if ( &right( i0 ) != &right_stride( i0 ) ) { update |= 8; }
+*/
     }
   }
 };
 
-
 template< typename T, class DeviceType >
 class TestAtomicViewAPI
 {
 public:
-  typedef DeviceType        device ;
+  typedef DeviceType device;
 
-  enum { N0 = 1000 ,
-         N1 = 3 ,
-         N2 = 5 ,
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
          N3 = 7 };
 
-  typedef Kokkos::View< T , device > dView0 ;
-  typedef Kokkos::View< T* , device > dView1 ;
-  typedef Kokkos::View< T*[N1] , device > dView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
-  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
-  typedef typename dView0::host_mirror_space host ;
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
 
-  typedef Kokkos::View< T , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView0 ;
-  typedef Kokkos::View< T* , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView1 ;
-  typedef Kokkos::View< T*[N1] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > const_aView4 ;
+  typedef Kokkos::View< T, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView0;
+  typedef Kokkos::View< T*, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView1;
+  typedef Kokkos::View< T*[N1], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView2;
+  typedef Kokkos::View< T*[N1][N2], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > const_aView4;
 
-  typedef Kokkos::View< T****, device, Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic > > aView4_unmanaged ;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic > > aView4_unmanaged;
 
-  typedef typename aView0::host_mirror_space host_atomic ;
+  typedef typename aView0::host_mirror_space host_atomic;
 
   TestAtomicViewAPI()
   {
-    TestViewOperator_LeftAndRight< int[2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2], device >::testit();
     run_test_rank0();
     run_test_rank4();
     run_test_const();
   }
 
-
   static void run_test_rank0()
   {
-    dView0 dx , dy ;
-    aView0 ax , ay , az ;
+    dView0 dx, dy;
+    aView0 ax, ay, az;
 
     dx = dView0( "dx" );
     dy = dView0( "dy" );
-    ASSERT_EQ( dx.use_count() , size_t(1) );
-    ASSERT_EQ( dy.use_count() , size_t(1) );
-
-    ax = dx ;
-    ay = dy ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
-    ASSERT_EQ( dy.use_count() , size_t(2) );
-    ASSERT_EQ( dx.use_count() , ax.use_count() );
-
-    az = ax ;
-    ASSERT_EQ( dx.use_count() , size_t(3) );
-    ASSERT_EQ( ax.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , ax.use_count() );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
+
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
+
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
   }
 
   static void run_test_rank4()
   {
-    dView4 dx , dy ;
-    aView4 ax , ay , az ;
+    dView4 dx, dy;
+    aView4 ax, ay, az;
 
-    dx = dView4( "dx" , N0 );
-    dy = dView4( "dy" , N0 );
-    ASSERT_EQ( dx.use_count() , size_t(1) );
-    ASSERT_EQ( dy.use_count() , size_t(1) );
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
 
-    ax = dx ;
-    ay = dy ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
-    ASSERT_EQ( dy.use_count() , size_t(2) );
-    ASSERT_EQ( dx.use_count() , ax.use_count() );
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
 
     dView4_unmanaged unmanaged_dx = dx;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
-    az = ax ;
-    ASSERT_EQ( dx.use_count() , size_t(3) );
-    ASSERT_EQ( ax.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , ax.use_count() );
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
 
     aView4_unmanaged unmanaged_ax = ax;
-    ASSERT_EQ( ax.use_count() , size_t(3) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
 
-    aView4_unmanaged unmanaged_ax_from_ptr_dx = aView4_unmanaged(dx.data(),
-                                                              dx.dimension_0(),
-                                                              dx.dimension_1(),
-                                                              dx.dimension_2(),
-                                                              dx.dimension_3());
-    ASSERT_EQ( ax.use_count() , size_t(3) );
+    aView4_unmanaged unmanaged_ax_from_ptr_dx =
+      aView4_unmanaged( dx.data(), dx.dimension_0(), dx.dimension_1(), dx.dimension_2(), dx.dimension_3() );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
 
-    const_aView4 const_ax = ax ;
-    ASSERT_EQ( ax.use_count() , size_t(4) );
-    ASSERT_EQ( const_ax.use_count() , ax.use_count() );
+    const_aView4 const_ax = ax;
+    ASSERT_EQ( ax.use_count(), size_t( 4 ) );
+    ASSERT_EQ( const_ax.use_count(), ax.use_count() );
 
     ASSERT_FALSE( ax.data() == 0 );
     ASSERT_FALSE( const_ax.data() == 0 ); // referenceable ptr
     ASSERT_FALSE( unmanaged_ax.data() == 0 );
     ASSERT_FALSE( unmanaged_ax_from_ptr_dx.data() == 0 );
     ASSERT_FALSE( ay.data() == 0 );
-//    ASSERT_NE( ax , ay );
+//    ASSERT_NE( ax, ay );
 //    Above test results in following runtime error from gtest:
 //    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F 00-00 00-31 44-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00> vs 32-byte object <80-01 D0-A0 D8-7F 00-00 00-A1 4A-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00>
 
-    ASSERT_EQ( ax.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( ax.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( ax.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( ax.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( ax.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( ax.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( ax.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( ax.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( ay.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( ay.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( ay.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( ay.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( ay.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( ay.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( ay.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( ay.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( unmanaged_ax_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+    ASSERT_EQ( unmanaged_ax_from_ptr_dx.capacity(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
   }
 
-  typedef T DataType[2] ;
+  typedef T DataType[2];
 
   static void
   check_auto_conversion_to_const(
-     const Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > & arg_const ,
-     const Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > & arg )
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg_const,
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg )
   {
     ASSERT_TRUE( arg_const == arg );
   }
 
   static void run_test_const()
   {
-    typedef Kokkos::View< DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > typeX ;
-    typedef Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > const_typeX ;
+    typedef Kokkos::View< DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > const_typeX;
 
     typeX x( "X" );
-    const_typeX xc = x ;
+    const_typeX xc = x;
 
     //ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
     //ASSERT_TRUE( x == xc );
 
-    check_auto_conversion_to_const( x , xc );
+    check_auto_conversion_to_const( x, xc );
   }
-
 };
 
-
 //---------------------------------------------------
 //-----------initialization functors-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct InitFunctor_Seq {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
+  view_type input;
+  const long length;
 
-  InitFunctor_Seq( view_type & input_ , const long length_ ) 
-    : input(input_)
-    , length(length_)
+  InitFunctor_Seq( view_type & input_, const long length_ )
+    : input( input_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      input(i) = (T) i ;
+      input( i ) = (T) i;
     }
   }
-
 };
 
-
 template<class T, class execution_space >
 struct InitFunctor_ModTimes {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
-  const long remainder ;
+  view_type input;
+  const long length;
+  const long remainder;
 
-  InitFunctor_ModTimes( view_type & input_ , const long length_ , const long remainder_ ) 
-    : input(input_)
-    , length(length_)
-    , remainder(remainder_)
+  InitFunctor_ModTimes( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      if ( i % (remainder+1) == remainder ) {
-        input(i) = (T)2 ;
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = (T) 2;
       }
       else {
-        input(i) = (T)1 ;
+        input( i ) = (T) 1;
       }
     }
   }
 };
 
-
 template<class T, class execution_space >
 struct InitFunctor_ModShift {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
-  const long remainder ;
+  view_type input;
+  const long length;
+  const long remainder;
 
-  InitFunctor_ModShift( view_type & input_ , const long length_ , const long remainder_ ) 
-    : input(input_)
-    , length(length_)
-    , remainder(remainder_)
+  InitFunctor_ModShift( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      if ( i % (remainder+1) == remainder ) {
-        input(i) = 1 ;
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = 1;
       }
     }
   }
 };
 
-
 //---------------------------------------------------
 //-----------atomic view plus-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct PlusEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
   // Wrap the result view in an atomic view, use this for operator
-  PlusEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  PlusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) += input(i);
+        even_odd_result( 0 ) += input( i );
       }
       else {
-        even_odd_result(1) += input(i);
+        even_odd_result( 1 ) += input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T PlusEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T PlusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length) , init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  PlusEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  PlusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0) + h_result_view(1) ) ;
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
 }
 
-template<class T>
+template< class T >
 T PlusEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
   T result[2];
+
   if ( N % 2 == 0 ) {
-    const long half_sum_end = (N/2) - 1;
+    const long half_sum_end = ( N / 2 ) - 1;
     const long full_sum_end = N - 1;
-    result[0] = half_sum_end*(half_sum_end + 1)/2 ; //even sum
-    result[1] = ( full_sum_end*(full_sum_end + 1)/2 ) - result[0] ; // odd sum
+    result[0] = half_sum_end * ( half_sum_end + 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end + 1 ) / 2 ) - result[0]; // Odd sum.
   }
   else {
-    const long half_sum_end = (T)(N/2) ;
+    const long half_sum_end = (T) ( N / 2 );
     const long full_sum_end = N - 2;
-    result[0] = half_sum_end*(half_sum_end - 1)/2 ; //even sum
-    result[1] = ( full_sum_end*(full_sum_end - 1)/2 ) - result[0] ; // odd sum
+    result[0] = half_sum_end * ( half_sum_end - 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end - 1 ) / 2 ) - result[0]; // Odd sum.
   }
 
-  return (T)(result[0] + result[1]);
+  return (T) ( result[0] + result[1] );
 }
 
-template<class T,class DeviceType>
-bool PlusEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool PlusEqualAtomicViewTest( long input_length )
 {
-  T res       = PlusEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = PlusEqualAtomicViewCheck<T>(input_length);
+  T res       = PlusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = PlusEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -469,104 +446,98 @@ bool PlusEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = PlusEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view minus-equal-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct MinusEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  MinusEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  MinusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) -= input(i);
+        even_odd_result( 0 ) -= input( i );
       }
       else {
-        even_odd_result(1) -= input(i);
+        even_odd_result( 1 ) -= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T MinusEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T MinusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  MinusEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  MinusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0) + h_result_view(1) ) ;
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
 }
 
-template<class T>
+template< class T >
 T MinusEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
   T result[2];
+
   if ( N % 2 == 0 ) {
-    const long half_sum_end = (N/2) - 1;
+    const long half_sum_end = ( N / 2 ) - 1;
     const long full_sum_end = N - 1;
-    result[0] = -1*( half_sum_end*(half_sum_end + 1)/2 ) ; //even sum
-    result[1] = -1*( ( full_sum_end*(full_sum_end + 1)/2 ) + result[0] ) ; // odd sum
+    result[0] = -1 * ( half_sum_end * ( half_sum_end + 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end + 1 ) / 2 ) + result[0] ); // Odd sum.
   }
   else {
-    const long half_sum_end = (long)(N/2) ;
+    const long half_sum_end = (long) ( N / 2 );
     const long full_sum_end = N - 2;
-    result[0] = -1*( half_sum_end*(half_sum_end - 1)/2 ) ; //even sum
-    result[1] = -1*( ( full_sum_end*(full_sum_end - 1)/2 ) + result[0] ) ; // odd sum
+    result[0] = -1 * ( half_sum_end * ( half_sum_end - 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end - 1 ) / 2 ) + result[0] ); // Odd sum.
   }
 
-  return (result[0] + result[1]);
+  return ( result[0] + result[1] );
 }
 
-template<class T,class DeviceType>
-bool MinusEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool MinusEqualAtomicViewTest( long input_length )
 {
-  T res       = MinusEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = MinusEqualAtomicViewCheck<T>(input_length);
+  T res       = MinusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = MinusEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -574,83 +545,76 @@ bool MinusEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MinusEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view times-equal-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct TimesEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
   // Wrap the result view in an atomic view, use this for operator
-  TimesEqualAtomicViewFunctor( const view_type & input_ , view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  TimesEqualAtomicViewFunctor( const view_type & input_, view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result(0) *= (double)input(i);
+      result( 0 ) *= (double) input( i );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T TimesEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T TimesEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",1) ;
-  deep_copy(result_view, 1.0);
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 1 );
+  deep_copy( result_view, 1.0 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  TimesEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  TimesEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T TimesEqualAtomicViewCheck( const long input_length, const long remainder ) {
-
-  //Analytical result
+  // Analytical result.
   const long N = input_length;
   T result = 1.0;
 
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) { 
+    if ( i % ( remainder + 1 ) == remainder ) {
       result *= 2.0;
     }
     else {
@@ -658,15 +622,15 @@ T TimesEqualAtomicViewCheck( const long input_length, const long remainder ) {
     }
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool TimesEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool TimesEqualAtomicViewTest( const long input_length )
 {
   const long remainder = 23;
-  T res       = TimesEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = TimesEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = TimesEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = TimesEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -674,101 +638,93 @@ bool TimesEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = TimesEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view div-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct DivEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  DivEqualAtomicViewFunctor( const view_type & input_ , scalar_view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  DivEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result() /= (double)(input(i));
+      result() /= (double) ( input( i ) );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T DivEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
-  typedef typename scalar_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T DivEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  scalar_view_type result_view("result_view") ;
-  Kokkos::deep_copy(result_view, 12121212121);
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  DivEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  DivEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view()) ;
+  return (T) ( h_result_view() );
 }
 
-template<class T>
-T DivEqualAtomicViewCheck( const long input_length , const long remainder ) {
-
+template< class T >
+T DivEqualAtomicViewCheck( const long input_length, const long remainder ) {
   const long N = input_length;
   T result = 12121212121.0;
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
       result /= 1.0;
     }
     else {
       result /= 2.0;
     }
-
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool DivEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool DivEqualAtomicViewTest( const long input_length )
 {
   const long remainder = 23;
 
-  T res       = DivEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = DivEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = DivEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = DivEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -776,83 +732,76 @@ bool DivEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DivEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view mod-equal------------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct ModEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  ModEqualAtomicViewFunctor( const view_type & input_ , scalar_view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  ModEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result() %= (double)(input(i));
+      result() %= (double) ( input( i ) );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T ModEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
-  typedef typename scalar_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T ModEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  scalar_view_type result_view("result_view") ;
-  Kokkos::deep_copy(result_view, 12121212121);
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  ModEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  ModEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view()) ;
+  return (T) ( h_result_view() );
 }
 
-template<class T>
-T ModEqualAtomicViewCheck( const long input_length , const long remainder ) {
-
+template< class T >
+T ModEqualAtomicViewCheck( const long input_length, const long remainder ) {
   const long N = input_length;
   T result = 12121212121;
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
       result %= 1;
     }
     else {
@@ -860,19 +809,18 @@ T ModEqualAtomicViewCheck( const long input_length , const long remainder ) {
     }
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool ModEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool ModEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "ModEqualAtomicView Error: Type must be integral type for this unit test");
+  static_assert( std::is_integral< T >::value, "ModEqualAtomicView Error: Type must be integral type for this unit test" );
 
   const long remainder = 23;
 
-  T res       = ModEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = ModEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = ModEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = ModEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -880,142 +828,134 @@ bool ModEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = ModEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view rs-equal------------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct RSEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T**** , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
 
   const view_type input;
   atomic_view_type result;
   const long length;
   const long value;
 
-  // Wrap the result view in an atomic view, use this for operator
-  RSEqualAtomicViewFunctor( const view_type & input_ , result_view_type & result_ , const long & length_ , const long & value_ ) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
-    , value(value_)
+  // Wrap the result view in an atomic view, use this for operator.
+  RSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 4 == 0 ) {
-        result(1,0,0,0) >>= input(i);
+        result( 1, 0, 0, 0 ) >>= input( i );
       }
       else if ( i % 4 == 1 ) {
-        result(0,1,0,0) >>= input(i);
+        result( 0, 1, 0, 0 ) >>= input( i );
       }
       else if ( i % 4 == 2 ) {
-        result(0,0,1,0) >>= input(i);
+        result( 0, 0, 1, 0 ) >>= input( i );
       }
       else if ( i % 4 == 3 ) {
-        result(0,0,0,1) >>= input(i);
+        result( 0, 0, 0, 1 ) >>= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T RSEqualAtomicView(const long input_length, const long value, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
-  typedef typename result_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T RSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  result_view_type result_view("result_view",2,2,2,2) ;
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-    h_result_view(1,0,0,0) = value;
-    h_result_view(0,1,0,0) = value;
-    h_result_view(0,0,1,0) = value;
-    h_result_view(0,0,0,1) = value;
-  Kokkos::deep_copy( result_view , h_result_view );
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  h_result_view( 1, 0, 0, 0 ) = value;
+  h_result_view( 0, 1, 0, 0 ) = value;
+  h_result_view( 0, 0, 1, 0 ) = value;
+  h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
 
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  InitFunctor_ModShift<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
-
-  RSEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length, value);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  RSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  Kokkos::deep_copy(h_result_view, result_view);
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(1,0,0,0)) ; 
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
 }
 
-template<class T>
+template< class T >
 T RSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
-
-  T result[4] ;
-  result[0] = value ;
-  result[1] = value ;
-  result[2] = value ;
-  result[3] = value ;
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
 
   T * input = new T[input_length];
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % (remainder+1) == remainder ) {
-        input[i] = 1;
-      }
-      else {
-        input[i] = 0;
-      }
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
   }
 
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % 4 == 0 ) {
-        result[0] >>= input[i];
-      }
-      else if ( i % 4 == 1 ) {
-        result[1] >>= input[i];
-      }
-      else if ( i % 4 == 2 ) {
-        result[2] >>= input[i];
-      }
-      else if ( i % 4 == 3 ) {
-        result[3] >>= input[i];
-      }
+    if ( i % 4 == 0 ) {
+      result[0] >>= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] >>= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] >>= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] >>= input[i];
+    }
   }
+
   delete [] input;
 
-  return (T)result[0]; 
+  return (T) result[0];
 }
 
-template<class T, class DeviceType>
-bool RSEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool RSEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "RSEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "RSEqualAtomicViewTest: Must be integral type for test" );
 
   const long remainder = 61042; //prime - 1
-  const long value =  1073741825; //  2^30+1
-  T res       = RSEqualAtomicView<T,DeviceType>(input_length, value, remainder);
-  T resSerial = RSEqualAtomicViewCheck<T>(input_length, value, remainder);
+  const long value = 1073741825; //  2^30+1
+  T res       = RSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = RSEqualAtomicViewCheck< T >( input_length, value, remainder );
 
   bool passed = true;
 
@@ -1023,142 +963,134 @@ bool RSEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RSEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view ls-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct LSEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T**** , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
   const long value;
 
-  // Wrap the result view in an atomic view, use this for operator
-  LSEqualAtomicViewFunctor( const view_type & input_ , result_view_type & result_ , const long & length_ , const long & value_ ) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
-    , value(value_)
+  // Wrap the result view in an atomic view, use this for operator.
+  LSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 4 == 0 ) {
-        result(1,0,0,0) <<= input(i);
+        result( 1, 0, 0, 0 ) <<= input( i );
       }
       else if ( i % 4 == 1 ) {
-        result(0,1,0,0) <<= input(i);
+        result( 0, 1, 0, 0 ) <<= input( i );
       }
       else if ( i % 4 == 2 ) {
-        result(0,0,1,0) <<= input(i);
+        result( 0, 0, 1, 0 ) <<= input( i );
       }
       else if ( i % 4 == 3 ) {
-        result(0,0,0,1) <<= input(i);
+        result( 0, 0, 0, 1 ) <<= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T LSEqualAtomicView(const long input_length, const long value, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
-  typedef typename result_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T LSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  result_view_type result_view("result_view",2,2,2,2) ;
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-    h_result_view(1,0,0,0) = value;
-    h_result_view(0,1,0,0) = value;
-    h_result_view(0,0,1,0) = value;
-    h_result_view(0,0,0,1) = value;
-  Kokkos::deep_copy( result_view , h_result_view );
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  h_result_view( 1, 0, 0, 0 ) = value;
+  h_result_view( 0, 1, 0, 0 ) = value;
+  h_result_view( 0, 0, 1, 0 ) = value;
+  h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
 
-  InitFunctor_ModShift<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  LSEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length, value);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  LSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  Kokkos::deep_copy(h_result_view, result_view);
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(1,0,0,0)) ; 
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
 }
 
-template<class T>
+template< class T >
 T LSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
-
-  T result[4] ;
-  result[0] = value ;
-  result[1] = value ;
-  result[2] = value ;
-  result[3] = value ;
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
 
   T * input = new T[input_length];
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % (remainder+1) == remainder ) {
-        input[i] = 1;
-      }
-      else {
-        input[i] = 0;
-      }
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
   }
 
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % 4 == 0 ) {
-        result[0] <<= input[i];
-      }
-      else if ( i % 4 == 1 ) {
-        result[1] <<= input[i];
-      }
-      else if ( i % 4 == 2 ) {
-        result[2] <<= input[i];
-      }
-      else if ( i % 4 == 3 ) {
-        result[3] <<= input[i];
-      }
+    if ( i % 4 == 0 ) {
+      result[0] <<= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] <<= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] <<= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] <<= input[i];
+    }
   }
 
   delete [] input;
 
-  return (T)result[0]; 
+  return (T) result[0];
 }
 
-template<class T, class DeviceType>
-bool LSEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool LSEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "LSEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "LSEqualAtomicViewTest: Must be integral type for test" );
 
   const long remainder = 61042; //prime - 1
-  const long value =  1; //  2^30+1
-  T res       = LSEqualAtomicView<T,DeviceType>(input_length, value, remainder);
-  T resSerial = LSEqualAtomicViewCheck<T>(input_length, value, remainder);
+  const long value = 1;
+  T res       = LSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = LSEqualAtomicViewCheck< T >( input_length, value, remainder );
 
   bool passed = true;
 
@@ -1166,104 +1098,96 @@ bool LSEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RSEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view and-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct AndEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  AndEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  AndEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) &= input(i);
+        even_odd_result( 0 ) &= input( i );
       }
       else {
-        even_odd_result(1) &= input(i);
+        even_odd_result( 1 ) &= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T AndEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T AndEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
-  Kokkos::deep_copy(result_view, 1);
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+  Kokkos::deep_copy( result_view, 1 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  AndEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  AndEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T AndEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
-  T result[2] = {1};
+  T result[2] = { 1 };
   for ( long i = 0; i < N; ++i ) {
     if ( i % 2 == 0 ) {
-      result[0] &= (T)i;
+      result[0] &= (T) i;
     }
     else {
-      result[1] &= (T)i;
+      result[1] &= (T) i;
     }
   }
 
-  return (result[0]);
+  return ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool AndEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool AndEqualAtomicViewTest( long input_length )
 {
+  static_assert( std::is_integral< T >::value, "AndEqualAtomicViewTest: Must be integral type for test" );
 
-  static_assert( std::is_integral<T>::value, "AndEqualAtomicViewTest: Must be integral type for test");
-
-  T res       = AndEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = AndEqualAtomicViewCheck<T>(input_length);
+  T res       = AndEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = AndEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1271,103 +1195,96 @@ bool AndEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = AndEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view or-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct OrEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  OrEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  OrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) |= input(i);
+        even_odd_result( 0 ) |= input( i );
       }
       else {
-        even_odd_result(1) |= input(i);
+        even_odd_result( 1 ) |= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T OrEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T OrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  OrEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  OrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T OrEqualAtomicViewCheck( const long input_length ) {
 
   const long N = input_length;
-  T result[2] = {0};
+  T result[2] = { 0 };
   for ( long i = 0; i < N; ++i ) {
     if ( i % 2 == 0 ) {
-      result[0] |= (T)i;
+      result[0] |= (T) i;
     }
     else {
-      result[1] |= (T)i;
+      result[1] |= (T) i;
     }
   }
 
-  return (T)(result[0]);
+  return (T) ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool OrEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool OrEqualAtomicViewTest( long input_length )
 {
-  
-  static_assert( std::is_integral<T>::value, "OrEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "OrEqualAtomicViewTest: Must be integral type for test" );
 
-  T res       = OrEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = OrEqualAtomicViewCheck<T>(input_length);
+  T res       = OrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = OrEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1375,103 +1292,95 @@ bool OrEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = OrEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view xor-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct XOrEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  XOrEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  XOrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) ^= input(i);
+        even_odd_result( 0 ) ^= input( i );
       }
       else {
-        even_odd_result(1) ^= input(i);
+        even_odd_result( 1 ) ^= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T XOrEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T XOrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  XOrEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  XOrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T XOrEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
-  T result[2] = {0};
+  T result[2] = { 0 };
   for ( long i = 0; i < N; ++i ) {
     if ( i % 2 == 0 ) {
-      result[0] ^= (T)i;
+      result[0] ^= (T) i;
     }
     else {
-      result[1] ^= (T)i;
+      result[1] ^= (T) i;
     }
   }
 
-  return (T)(result[0]);
+  return (T) ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool XOrEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool XOrEqualAtomicViewTest( long input_length )
 {
+  static_assert( std::is_integral< T >::value, "XOrEqualAtomicViewTest: Must be integral type for test" );
 
-  static_assert( std::is_integral<T>::value, "XOrEqualAtomicViewTest: Must be integral type for test");
-
-  T res       = XOrEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = XOrEqualAtomicViewCheck<T>(input_length);
+  T res       = XOrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = XOrEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1479,54 +1388,52 @@ bool XOrEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = XOrEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 // inc/dec?
 
-
 //---------------------------------------------------
 //--------------atomic_test_control------------------
 //---------------------------------------------------
 
-template<class T,class DeviceType>
-bool AtomicViewsTestIntegralType( const int length , int test )
+template< class T, class DeviceType >
+bool AtomicViewsTestIntegralType( const int length, int test )
 {
-  static_assert( std::is_integral<T>::value, "TestAtomicViews Error: Non-integral type passed into IntegralType tests");
-
-  switch (test) {
-    case 1: return PlusEqualAtomicViewTest<T,DeviceType>( length );
-    case 2: return MinusEqualAtomicViewTest<T,DeviceType>( length );
-    case 3: return RSEqualAtomicViewTest<T,DeviceType>( length );
-    case 4: return LSEqualAtomicViewTest<T,DeviceType>( length );
-    case 5: return ModEqualAtomicViewTest<T,DeviceType>( length );
-    case 6: return AndEqualAtomicViewTest<T,DeviceType>( length );
-    case 7: return OrEqualAtomicViewTest<T,DeviceType>( length );
-    case 8: return XOrEqualAtomicViewTest<T,DeviceType>( length );
+  static_assert( std::is_integral< T >::value, "TestAtomicViews Error: Non-integral type passed into IntegralType tests" );
+
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return RSEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return LSEqualAtomicViewTest< T, DeviceType >( length );
+    case 5: return ModEqualAtomicViewTest< T, DeviceType >( length );
+    case 6: return AndEqualAtomicViewTest< T, DeviceType >( length );
+    case 7: return OrEqualAtomicViewTest< T, DeviceType >( length );
+    case 8: return XOrEqualAtomicViewTest< T, DeviceType >( length );
   }
+
   return 0;
 }
 
-
-template<class T,class DeviceType>
-bool AtomicViewsTestNonIntegralType( const int length , int test )
+template< class T, class DeviceType >
+bool AtomicViewsTestNonIntegralType( const int length, int test )
 {
-  switch (test) {
-    case 1: return PlusEqualAtomicViewTest<T,DeviceType>( length );
-    case 2: return MinusEqualAtomicViewTest<T,DeviceType>( length );
-    case 3: return TimesEqualAtomicViewTest<T,DeviceType>( length );
-    case 4: return DivEqualAtomicViewTest<T,DeviceType>( length );
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return TimesEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return DivEqualAtomicViewTest< T, DeviceType >( length );
   }
+
   return 0;
 }
 
-} // namespace
-
+} // namespace TestAtomicViews
diff --git a/lib/kokkos/core/unit_test/TestCXX11.hpp b/lib/kokkos/core/unit_test/TestCXX11.hpp
index d6dde5e963e1f0706fecd333b56dd9e8ed181d0e..e2ad623d9c89cef44c4e55a9096d3dba6796adf6 100644
--- a/lib/kokkos/core/unit_test/TestCXX11.hpp
+++ b/lib/kokkos/core/unit_test/TestCXX11.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,283 +36,294 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
 #include <Kokkos_Core.hpp>
 
 namespace TestCXX11 {
 
-template<class DeviceType>
-struct FunctorAddTest{
-  typedef Kokkos::View<double**,DeviceType> view_type;
-  view_type a_, b_;
+template< class DeviceType >
+struct FunctorAddTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
   typedef DeviceType execution_space;
-  FunctorAddTest(view_type & a, view_type &b):a_(a),b_(b) {}
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_, b_;
+
+  FunctorAddTest( view_type & a, view_type & b ) : a_( a ), b_( b ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-    b_(i,0) = a_(i,1) + a_(i,2);
-    b_(i,1) = a_(i,0) - a_(i,3);
-    b_(i,2) = a_(i,4) + a_(i,0);
-    b_(i,3) = a_(i,2) - a_(i,1);
-    b_(i,4) = a_(i,3) + a_(i,4);
+  void operator() ( const int& i ) const {
+    b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+    b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+    b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+    b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+    b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
   }
 
-  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_member & dev) const {
-    const int begin = dev.league_rank() * 4 ;
-    const int end   = begin + 4 ;
-    for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-      b_(i,0) = a_(i,1) + a_(i,2);
-      b_(i,1) = a_(i,0) - a_(i,3);
-      b_(i,2) = a_(i,4) + a_(i,0);
-      b_(i,3) = a_(i,2) - a_(i,1);
-      b_(i,4) = a_(i,3) + a_(i,4);
+  void operator() ( const team_member & dev ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+      b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+      b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+      b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+      b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
     }
   }
 };
 
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double AddTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-
-  Kokkos::View<double**,DeviceType> a("A",100,5);
-  Kokkos::View<double**,DeviceType> b("B",100,5);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
-  if(PWRTest==false)
-    Kokkos::parallel_for(100,FunctorAddTest<DeviceType>(a,b));
-  else
-    Kokkos::parallel_for(policy_type(25,Kokkos::AUTO),FunctorAddTest<DeviceType>(a,b));
-  Kokkos::deep_copy(h_b,b);
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, FunctorAddTest< DeviceType >( a, b ) );
+  }
+  else {
+    Kokkos::parallel_for( policy_type( 25, Kokkos::AUTO ), FunctorAddTest< DeviceType >( a, b ) );
+  }
+  Kokkos::deep_copy( h_b, b );
 
   double result = 0;
-  for(int i=0;i<100;i++) {
-      for(int j=0;j<5;j++)
-         result += h_b(i,j);
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
     }
+  }
 
   return result;
 }
 
-
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-template<class DeviceType, bool PWRTest>
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
 double AddTestLambda() {
-
-  Kokkos::View<double**,DeviceType> a("A",100,5);
-  Kokkos::View<double**,DeviceType> b("B",100,5);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
-
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
-
-  if(PWRTest==false) {
-    Kokkos::parallel_for(100,KOKKOS_LAMBDA(const int& i)  {
-      b(i,0) = a(i,1) + a(i,2);
-      b(i,1) = a(i,0) - a(i,3);
-      b(i,2) = a(i,4) + a(i,0);
-      b(i,3) = a(i,2) - a(i,1);
-      b(i,4) = a(i,3) + a(i,4);
+  Kokkos::deep_copy( a, h_a );
+
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, KOKKOS_LAMBDA( const int & i ) {
+      b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+      b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+      b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+      b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+      b( i, 4 ) = a( i, 3 ) + a( i, 4 );
     });
-  } else {
-    typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-    typedef typename policy_type::member_type team_member ;
-
-    policy_type policy(25,Kokkos::AUTO);
-
-    Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const team_member & dev)  {
-      const int begin = dev.league_rank() * 4 ;
-      const int end   = begin + 4 ;
-      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-        b(i,0) = a(i,1) + a(i,2);
-        b(i,1) = a(i,0) - a(i,3);
-        b(i,2) = a(i,4) + a(i,0);
-        b(i,3) = a(i,2) - a(i,1);
-        b(i,4) = a(i,3) + a(i,4);
+  }
+  else {
+    typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+    typedef typename policy_type::member_type team_member;
+
+    policy_type policy( 25, Kokkos::AUTO );
+
+    Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const team_member & dev ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+        b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+        b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+        b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+        b( i, 4 ) = a( i, 3 ) + a( i, 4 );
       }
     });
   }
-  Kokkos::deep_copy(h_b,b);
+  Kokkos::deep_copy( h_b, b );
 
   double result = 0;
-  for(int i=0;i<100;i++) {
-      for(int j=0;j<5;j++)
-         result += h_b(i,j);
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
     }
+  }
 
   return result;
 }
-
 #else
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double AddTestLambda() {
-  return AddTestFunctor<DeviceType,PWRTest>();
+  return AddTestFunctor< DeviceType, PWRTest >();
 }
 #endif
 
-
-template<class DeviceType>
-struct FunctorReduceTest{
-  typedef Kokkos::View<double**,DeviceType> view_type;
-  view_type a_;
+template< class DeviceType >
+struct FunctorReduceTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
   typedef DeviceType execution_space;
   typedef double value_type;
-  FunctorReduceTest(view_type & a):a_(a) {}
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_;
+
+  FunctorReduceTest( view_type & a ) : a_( a ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, value_type& sum) const {
-    sum += a_(i,1) + a_(i,2);
-    sum += a_(i,0) - a_(i,3);
-    sum += a_(i,4) + a_(i,0);
-    sum += a_(i,2) - a_(i,1);
-    sum += a_(i,3) + a_(i,4);
+  void operator() ( const int & i, value_type & sum ) const {
+    sum += a_( i, 1 ) + a_( i, 2 );
+    sum += a_( i, 0 ) - a_( i, 3 );
+    sum += a_( i, 4 ) + a_( i, 0 );
+    sum += a_( i, 2 ) - a_( i, 1 );
+    sum += a_( i, 3 ) + a_( i, 4 );
   }
 
-  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
-
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_member & dev, value_type& sum) const {
-    const int begin = dev.league_rank() * 4 ;
-    const int end   = begin + 4 ;
-    for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-      sum += a_(i,1) + a_(i,2);
-      sum += a_(i,0) - a_(i,3);
-      sum += a_(i,4) + a_(i,0);
-      sum += a_(i,2) - a_(i,1);
-      sum += a_(i,3) + a_(i,4);
+  void operator() ( const team_member & dev, value_type & sum ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      sum += a_( i, 1 ) + a_( i, 2 );
+      sum += a_( i, 0 ) - a_( i, 3 );
+      sum += a_( i, 4 ) + a_( i, 0 );
+      sum += a_( i, 2 ) - a_( i, 1 );
+      sum += a_( i, 3 ) + a_( i, 4 );
     }
   }
+
   KOKKOS_INLINE_FUNCTION
-  void init(value_type& update) const {update = 0.0;}
+  void init( value_type & update ) const { update = 0.0; }
+
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update, volatile value_type const& input) const {update += input;}
+  void join( volatile value_type & update, volatile value_type const & input ) const { update += input; }
 };
 
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double ReduceTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-  typedef Kokkos::View<double**,DeviceType> view_type ;
-  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
-
-  view_type a("A",100,5);
-  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
   double result = 0.0;
-  if(PWRTest==false)
-    Kokkos::parallel_reduce(100,FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
-  else
-    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
+  else {
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
 
   return result;
 }
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-template<class DeviceType, bool PWRTest>
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
 double ReduceTestLambda() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-  typedef Kokkos::View<double**,DeviceType> view_type ;
-  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
-
-  view_type a("A",100,5);
-  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
   double result = 0.0;
 
-  if(PWRTest==false) {
-    Kokkos::parallel_reduce(100,KOKKOS_LAMBDA(const int& i, double& sum)  {
-      sum += a(i,1) + a(i,2);
-      sum += a(i,0) - a(i,3);
-      sum += a(i,4) + a(i,0);
-      sum += a(i,2) - a(i,1);
-      sum += a(i,3) + a(i,4);
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, KOKKOS_LAMBDA( const int & i, double & sum ) {
+      sum += a( i, 1 ) + a( i, 2 );
+      sum += a( i, 0 ) - a( i, 3 );
+      sum += a( i, 4 ) + a( i, 0 );
+      sum += a( i, 2 ) - a( i, 1 );
+      sum += a( i, 3 ) + a( i, 4 );
     }, unmanaged_result( & result ) );
-  } else {
-    typedef typename policy_type::member_type team_member ;
-    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),KOKKOS_LAMBDA(const team_member & dev, double& sum)  {
-      const int begin = dev.league_rank() * 4 ;
-      const int end   = begin + 4 ;
-      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-        sum += a(i,1) + a(i,2);
-        sum += a(i,0) - a(i,3);
-        sum += a(i,4) + a(i,0);
-        sum += a(i,2) - a(i,1);
-        sum += a(i,3) + a(i,4);
+  }
+  else {
+    typedef typename policy_type::member_type team_member;
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), KOKKOS_LAMBDA( const team_member & dev, double & sum ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        sum += a( i, 1 ) + a( i, 2 );
+        sum += a( i, 0 ) - a( i, 3 );
+        sum += a( i, 4 ) + a( i, 0 );
+        sum += a( i, 2 ) - a( i, 1 );
+        sum += a( i, 3 ) + a( i, 4 );
       }
     }, unmanaged_result( & result ) );
   }
 
   return result;
 }
-
 #else
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double ReduceTestLambda() {
-  return ReduceTestFunctor<DeviceType,PWRTest>();
+  return ReduceTestFunctor< DeviceType, PWRTest >();
 }
 #endif
 
-template<class DeviceType>
-double TestVariantLambda(int test) {
-  switch (test) {
-    case 1: return AddTestLambda<DeviceType,false>();
-    case 2: return AddTestLambda<DeviceType,true>();
-    case 3: return ReduceTestLambda<DeviceType,false>();
-    case 4: return ReduceTestLambda<DeviceType,true>();
+template< class DeviceType >
+double TestVariantLambda( int test ) {
+  switch ( test ) {
+    case 1: return AddTestLambda< DeviceType, false >();
+    case 2: return AddTestLambda< DeviceType, true >();
+    case 3: return ReduceTestLambda< DeviceType, false >();
+    case 4: return ReduceTestLambda< DeviceType, true >();
   }
+
   return 0;
 }
 
-
-template<class DeviceType>
-double TestVariantFunctor(int test) {
-  switch (test) {
-    case 1: return AddTestFunctor<DeviceType,false>();
-    case 2: return AddTestFunctor<DeviceType,true>();
-    case 3: return ReduceTestFunctor<DeviceType,false>();
-    case 4: return ReduceTestFunctor<DeviceType,true>();
+template< class DeviceType >
+double TestVariantFunctor( int test ) {
+  switch ( test ) {
+    case 1: return AddTestFunctor< DeviceType, false >();
+    case 2: return AddTestFunctor< DeviceType, true >();
+    case 3: return ReduceTestFunctor< DeviceType, false >();
+    case 4: return ReduceTestFunctor< DeviceType, true >();
   }
+
   return 0;
 }
 
-template<class DeviceType>
-bool Test(int test) {
-
+template< class DeviceType >
+bool Test( int test ) {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  double res_functor = TestVariantFunctor<DeviceType>(test);
-  double res_lambda = TestVariantLambda<DeviceType>(test);
+  double res_functor = TestVariantFunctor< DeviceType >( test );
+  double res_lambda = TestVariantLambda< DeviceType >( test );
 
-  char testnames[5][256] = {" "
-                            ,"AddTest","AddTest TeamPolicy"
-                            ,"ReduceTest","ReduceTest TeamPolicy"
+  char testnames[5][256] = { " "
+                           , "AddTest", "AddTest TeamPolicy"
+                           , "ReduceTest", "ReduceTest TeamPolicy"
                            };
   bool passed = true;
 
@@ -322,13 +333,13 @@ bool Test(int test) {
     std::cout << "CXX11 ( test = '"
               << testnames[test] << "' FAILED : "
               << res_functor << " != " << res_lambda
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 #else
   return true;
 #endif
 }
 
-}
+} // namespace TestCXX11
diff --git a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
index 359e17a44f1642d630b97987f8d049fc3217a9fb..b53b42b8e05bc906c17f2ad59bdf1ebb9fd62ef7 100644
--- a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
+++ b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,10 +36,11 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
 #include <Kokkos_Core.hpp>
 
 #ifndef TESTCXX11DEDUCTION_HPP
@@ -52,43 +53,40 @@ struct TestReductionDeductionTagB {};
 
 template < class ExecSpace >
 struct TestReductionDeductionFunctor {
-
   // KOKKOS_INLINE_FUNCTION
-  // void operator()( long i , long & value ) const
-  // { value += i + 1 ; }
+  // void operator()( long i, long & value ) const
+  // { value += i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( TestReductionDeductionTagA , long i , long & value ) const
+  void operator()( TestReductionDeductionTagA, long i, long & value ) const
   { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TestReductionDeductionTagB & , const long i , long & value ) const
-  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ) ; }
-
+  void operator()( const TestReductionDeductionTagB &, const long i, long & value ) const
+  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ); }
 };
 
 template< class ExecSpace >
 void test_reduction_deduction()
 {
-  typedef TestReductionDeductionFunctor< ExecSpace > Functor ;
+  typedef TestReductionDeductionFunctor< ExecSpace > Functor;
 
-  const long N = 50 ;
-  // const long answer  = N % 2 ? ( N * ((N+1)/2 )) : ( (N/2) * (N+1) );
-  const long answerA = N % 2 ? ( (2*N) * (((2*N)+1)/2 )) : ( ((2*N)/2) * ((2*N)+1) );
-  const long answerB = N % 2 ? ( (3*N) * (((3*N)+1)/2 )) : ( ((3*N)/2) * ((3*N)+1) );
-  long result = 0 ;
+  const long N = 50;
+  // const long answer  = N % 2 ? ( N * ( ( N + 1 ) / 2 ) ) : ( ( N / 2 ) * ( N + 1 ) );
+  const long answerA = N % 2 ? ( ( 2 * N ) * ( ( ( 2 * N ) + 1 ) / 2 ) ) : ( ( ( 2 * N ) / 2 ) * ( ( 2 * N ) + 1 ) );
+  const long answerB = N % 2 ? ( ( 3 * N ) * ( ( ( 3 * N ) + 1 ) / 2 ) ) : ( ( ( 3 * N ) / 2 ) * ( ( 3 * N ) + 1 ) );
+  long result = 0;
 
-  // Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>(0,N) , Functor() , result );
-  // ASSERT_EQ( answer , result );
-  
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagA>(0,N) , Functor() , result );
-  ASSERT_EQ( answerA , result );
-  
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagB>(0,N) , Functor() , result );
-  ASSERT_EQ( answerB , result );
-}
+  // Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), Functor(), result );
+  // ASSERT_EQ( answer, result );
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagA >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerA, result );
 
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagB >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerB, result );
 }
 
-#endif
+} // namespace TestCXX11
 
+#endif
diff --git a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
index 5add656a4d7aaa7b70bc247a9ed3af1599e27211..45554383446ec13794f9e22bb0819477a7bdb278 100644
--- a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
+++ b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,17 +47,17 @@
 
 namespace TestCompilerMacros {
 
-template<class DEVICE_TYPE>
+template< class DEVICE_TYPE >
 struct AddFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<int**,execution_space> type;
-  type a,b;
+  typedef typename Kokkos::View< int**, execution_space > type;
+  type a, b;
   int length;
 
-  AddFunctor(type a_, type b_):a(a_),b(b_),length(a.dimension_1()) {}
+  AddFunctor( type a_, type b_ ) : a( a_ ), b( b_ ), length( a.dimension_1() ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int i) const {
+  void operator()( int i ) const {
 #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
     #pragma unroll
 #endif
@@ -75,21 +75,23 @@ struct AddFunctor {
     #pragma simd
 #endif
 #endif
-    for(int j=0;j<length;j++)
-      a(i,j) += b(i,j);
+    for ( int j = 0; j < length; j++ ) {
+      a( i, j ) += b( i, j );
+    }
   }
 };
 
-template<class DeviceType>
+template< class DeviceType >
 bool Test() {
-  typedef typename Kokkos::View<int**,DeviceType> type;
-  type a("A",1024,128);
-  type b("B",1024,128);
+  typedef typename Kokkos::View< int**, DeviceType > type;
+  type a( "A", 1024, 128 );
+  type b( "B", 1024, 128 );
 
-  AddFunctor<DeviceType> f(a,b);
-  Kokkos::parallel_for(1024,f);
+  AddFunctor< DeviceType > f( a, b );
+  Kokkos::parallel_for( 1024, f );
   DeviceType::fence();
+
   return true;
 }
 
-}
+} // namespace TestCompilerMacros
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
index 7e08f67e69721dc803f1ea4a23cbe3328af391dc..f85a35c096516fe77c39cfaaa1778a9d5bb895ef 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
@@ -45,13 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestAtomic.hpp>
-
 #include <TestViewAPI.hpp>
-
 #include <TestReduce.hpp>
 #include <TestScan.hpp>
 #include <TestTeam.hpp>
@@ -78,24 +75,25 @@ protected:
 
 TEST_F( defaultdevicetype, host_space_access )
 {
-  typedef Kokkos::HostSpace::execution_space host_exec_space ;
-  typedef Kokkos::Device< host_exec_space , Kokkos::HostSpace > device_space ;
-  typedef Kokkos::Impl::HostMirror< Kokkos::DefaultExecutionSpace >::Space mirror_space ;
+  typedef Kokkos::HostSpace::execution_space host_exec_space;
+  typedef Kokkos::Device< host_exec_space, Kokkos::HostSpace > device_space;
+  typedef Kokkos::Impl::HostMirror< Kokkos::DefaultExecutionSpace >::Space mirror_space;
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< host_exec_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< host_exec_space, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< device_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< device_space, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< mirror_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< mirror_space, Kokkos::HostSpace >::accessible, "" );
 }
 
-TEST_F( defaultdevicetype, view_api) {
-  TestViewAPI< double , Kokkos::DefaultExecutionSpace >();
+TEST_F( defaultdevicetype, view_api )
+{
+  TestViewAPI< double, Kokkos::DefaultExecutionSpace >();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 7778efde301bb9fd8856c9743bfcaaea2d7b3095..401da58a5838d7cab5adaf38a00d4231f51721d2 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,376 +44,425 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
+
 #ifdef KOKKOS_ENABLE_OPENMP
 #include <omp.h>
 #endif
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 namespace Test {
 
 namespace Impl {
 
-  char** init_kokkos_args(bool do_threads,bool do_numa,bool do_device,bool do_other, int& nargs, Kokkos::InitArguments& init_args) {
-    nargs = (do_threads?1:0) +
-            (do_numa?1:0) +
-            (do_device?1:0) +
-            (do_other?4:0);
-    char** args_kokkos = new char*[nargs];
-    for(int i = 0; i < nargs; i++)
-      args_kokkos[i] = new char[20];
+char** init_kokkos_args( bool do_threads, bool do_numa, bool do_device, bool do_other, int & nargs, Kokkos::InitArguments & init_args ) {
+  nargs = ( do_threads ? 1 : 0 ) +
+          ( do_numa ? 1 : 0 ) +
+          ( do_device ? 1 : 0 ) +
+          ( do_other ? 4 : 0 );
 
-    int threads_idx = do_other?1:0;
-    int numa_idx = (do_other?3:0) + (do_threads?1:0);
-    int device_idx = (do_other?3:0) + (do_threads?1:0) + (do_numa?1:0);
+  char** args_kokkos = new char*[nargs];
+  for ( int i = 0; i < nargs; i++ ) {
+    args_kokkos[i] = new char[20];
+  }
 
+  int threads_idx = do_other ? 1 : 0;
+  int numa_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 );
+  int device_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 ) + ( do_numa ? 1 : 0 );
 
-    if(do_threads) {
-      int nthreads = 3;
+  if ( do_threads ) {
+    int nthreads = 3;
 
 #ifdef KOKKOS_ENABLE_OPENMP
-      if(omp_get_max_threads() < 3)
-        nthreads = omp_get_max_threads();
+    if ( omp_get_max_threads() < 3 )
+      nthreads = omp_get_max_threads();
 #endif
 
-      if(Kokkos::hwloc::available())  {
-        if(Kokkos::hwloc::get_available_threads_per_core()<3)
-            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
-                       * Kokkos::hwloc::get_available_numa_count();
-      }
-
-#ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        nthreads = 1;
-      }
-#endif
-      init_args.num_threads = nthreads;
-      sprintf(args_kokkos[threads_idx],"--threads=%i",nthreads);
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 )
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_numa) {
-      int numa = 1;
-      if(Kokkos::hwloc::available())
-        numa = Kokkos::hwloc::get_available_numa_count();
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        numa = 1;
-      }
-#endif
-
-      init_args.num_numa = numa;
-      sprintf(args_kokkos[numa_idx],"--numa=%i",numa);
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
     }
+#endif
 
-    if(do_device) {
+    init_args.num_threads = nthreads;
+    sprintf( args_kokkos[threads_idx], "--threads=%i", nthreads );
+  }
 
-      init_args.device_id = 0;
-      sprintf(args_kokkos[device_idx],"--device=%i",0);
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_other) {
-      sprintf(args_kokkos[0],"--dummyarg=1");
-      sprintf(args_kokkos[threads_idx+(do_threads?1:0)],"--dummy2arg");
-      sprintf(args_kokkos[threads_idx+(do_threads?1:0)+1],"dummy3arg");
-      sprintf(args_kokkos[device_idx+(do_device?1:0)],"dummy4arg=1");
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
     }
+#endif
 
+    init_args.num_numa = numa;
+    sprintf( args_kokkos[numa_idx], "--numa=%i", numa );
+  }
 
-    return args_kokkos;
+  if ( do_device ) {
+    init_args.device_id = 0;
+    sprintf( args_kokkos[device_idx], "--device=%i", 0 );
   }
 
-  Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, bool do_device) {
-    Kokkos::InitArguments args;
+  if ( do_other ) {
+    sprintf( args_kokkos[0], "--dummyarg=1" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) ], "--dummy2arg" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) + 1 ], "dummy3arg" );
+    sprintf( args_kokkos[ device_idx + ( do_device ? 1 : 0 ) ], "dummy4arg=1" );
+  }
+
+  return args_kokkos;
+}
+
+Kokkos::InitArguments init_initstruct( bool do_threads, bool do_numa, bool do_device ) {
+  Kokkos::InitArguments args;
 
-    if(do_threads) {
-      int nthreads = 3;
+  if ( do_threads ) {
+    int nthreads = 3;
 
 #ifdef KOKKOS_ENABLE_OPENMP
-      if(omp_get_max_threads() < 3)
-        nthreads = omp_get_max_threads();
+    if ( omp_get_max_threads() < 3 ) {
+      nthreads = omp_get_max_threads();
+    }
 #endif
 
-      if(Kokkos::hwloc::available())  {
-        if(Kokkos::hwloc::get_available_threads_per_core()<3)
-            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
-                       * Kokkos::hwloc::get_available_numa_count();
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 ) {
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
       }
+    }
+
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        nthreads = 1;
-      }
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
+    }
 #endif
 
-      args.num_threads = nthreads;
+    args.num_threads = nthreads;
+  }
+
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_numa) {
-      int numa = 1;
-      if(Kokkos::hwloc::available())
-        numa = Kokkos::hwloc::get_available_numa_count();
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        numa = 1;
-      }
-#endif
-      args.num_numa = numa;
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
     }
+#endif
 
-    if(do_device) {
-      args.device_id = 0;
-    }
+    args.num_numa = numa;
+  }
 
-    return args;
+  if ( do_device ) {
+    args.device_id = 0;
   }
 
-  void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
-    ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1);
-    ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1);
-
-    //Figure out the number of threads the HostSpace ExecutionSpace should have initialized to
-    int expected_nthreads = argstruct.num_threads;
-    if(expected_nthreads<1) {
-      if(Kokkos::hwloc::available()) {
-        expected_nthreads = Kokkos::hwloc::get_available_numa_count()
-                          * Kokkos::hwloc::get_available_cores_per_numa()
-                          * Kokkos::hwloc::get_available_threads_per_core();
-      } else {
-        #ifdef KOKKOS_ENABLE_OPENMP
-        if(std::is_same<Kokkos::HostSpace::execution_space,Kokkos::OpenMP>::value) {
-          expected_nthreads = omp_get_max_threads();
-        } else
-        #endif
-          expected_nthreads = 1;
+  return args;
+}
+
+void check_correct_initialization( const Kokkos::InitArguments & argstruct ) {
+  ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1 );
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1 );
+
+  // Figure out the number of threads the HostSpace ExecutionSpace should have been initialized with.
+  int expected_nthreads = argstruct.num_threads;
 
+  if ( expected_nthreads < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                        * Kokkos::hwloc::get_available_cores_per_numa()
+                        * Kokkos::hwloc::get_available_threads_per_core();
+    }
+    else {
+#ifdef KOKKOS_ENABLE_OPENMP
+      if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) {
+        expected_nthreads = omp_get_max_threads();
       }
-      #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
-         std::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value ) 
+      else
+#endif
         expected_nthreads = 1;
-      #endif
     }
 
-    int expected_numa = argstruct.num_numa;
-    if(expected_numa<1) {
-      if(Kokkos::hwloc::available()) {
-        expected_numa = Kokkos::hwloc::get_available_numa_count();
-      } else {
-        expected_numa = 1;
-      }
-      #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
-         std::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value )
-        expected_numa = 1;
-      #endif
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value ) {
+      expected_nthreads = 1;
     }
-    ASSERT_EQ(Kokkos::HostSpace::execution_space::thread_pool_size(),expected_nthreads);
+#endif
+  }
 
-#ifdef KOKKOS_ENABLE_CUDA
-    if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value) {
-      int device;
-      cudaGetDevice( &device );
-      int expected_device = argstruct.device_id;
-      if(argstruct.device_id<0) {
-        expected_device = 0;
-      }
-      ASSERT_EQ(expected_device,device);
+  int expected_numa = argstruct.num_numa;
+
+  if ( expected_numa < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_numa = Kokkos::hwloc::get_available_numa_count();
+    }
+    else {
+      expected_numa = 1;
     }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value )
+      expected_numa = 1;
 #endif
   }
 
-  //ToDo: Add check whether correct number of threads are actually started
-  void test_no_arguments() {
-    Kokkos::initialize();
-    check_correct_initialization(Kokkos::InitArguments());
-    Kokkos::finalize();
-  }
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::thread_pool_size(), expected_nthreads );
 
-  void test_commandline_args(int nargs, char** args, const Kokkos::InitArguments& argstruct) {
-    Kokkos::initialize(nargs,args);
-    check_correct_initialization(argstruct);
-    Kokkos::finalize();
-  }
 
-  void test_initstruct_args(const Kokkos::InitArguments& args) {
-    Kokkos::initialize(args);
-    check_correct_initialization(args);
-    Kokkos::finalize();
+#ifdef KOKKOS_ENABLE_CUDA
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Cuda >::value ) {
+    int device;
+    cudaGetDevice( &device );
+
+    int expected_device = argstruct.device_id;
+    if ( argstruct.device_id < 0 ) {
+      expected_device = 0;
+    }
+
+    ASSERT_EQ( expected_device, device );
   }
+#endif
+}
+
+// TODO: Add a check that the correct number of threads is actually started.
+void test_no_arguments() {
+  Kokkos::initialize();
+  check_correct_initialization( Kokkos::InitArguments() );
+  Kokkos::finalize();
 }
 
+void test_commandline_args( int nargs, char** args, const Kokkos::InitArguments & argstruct ) {
+  Kokkos::initialize( nargs, args );
+  check_correct_initialization( argstruct );
+  Kokkos::finalize();
+}
+
+void test_initstruct_args( const Kokkos::InitArguments & args ) {
+  Kokkos::initialize( args );
+  check_correct_initialization( args );
+  Kokkos::finalize();
+}
+
+} // namespace Impl
+
 class defaultdevicetypeinit : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-  }
+  static void SetUpTestCase() {}
 
-  static void TearDownTestCase()
-  {
-  }
+  static void TearDownTestCase() {}
 };
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
-TEST_F( defaultdevicetypeinit, no_args) {
+TEST_F( defaultdevicetypeinit, no_args )
+{
   Impl::test_no_arguments();
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
-TEST_F( defaultdevicetypeinit, commandline_args_empty) {
+TEST_F( defaultdevicetypeinit, commandline_args_empty )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
-TEST_F( defaultdevicetypeinit, commandline_args_other) {
+TEST_F( defaultdevicetypeinit, commandline_args_other )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,false,true,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, false, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,false,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,false,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
-TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_numa_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,true,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
-TEST_F( defaultdevicetypeinit, commandline_args_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,true,true,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, true, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
-TEST_F( defaultdevicetypeinit, initstruct_default) {
+TEST_F( defaultdevicetypeinit, initstruct_default )
+{
   Kokkos::InitArguments args;
-  Impl::test_initstruct_args(args);
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
-TEST_F( defaultdevicetypeinit, initstruct_nthreads) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,false,false);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, false );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,true,false);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, false );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
-TEST_F( defaultdevicetypeinit, initstruct_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(false,false,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( false, false, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,false,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,true,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
-
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
index dd148a062446f253bbcbc854b775eefd85debf79..4fdfa959107becae384ffa5c5e09d444e9299670 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_a) {
+TEST_F( defaultdevicetype, reduce_instantiation_a )
+{
   TestReduceCombinatoricalInstantiation<>::execute_a();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
index c8edfdd5c39d575400408e8dbf5fb3cdd2005d66..841f34e03dd1f9900d304a8f6e889a5d30dc2a65 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_b) {
+TEST_F( defaultdevicetype, reduce_instantiation_b )
+{
   TestReduceCombinatoricalInstantiation<>::execute_b();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
index 405d49a9b891619f3d823a5559e7751b8f3b885b..602863be3852a603d6c8e803752ad4a67709c0d5 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_c) {
+TEST_F( defaultdevicetype, reduce_instantiation_c )
+{
   TestReduceCombinatoricalInstantiation<>::execute_c();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
index 426cc4f06c6157d37db40ea2feeceac242710ea0..5d3665b905434d1310dc51e430940b17690baac1 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
@@ -45,13 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestAtomic.hpp>
-
 #include <TestViewAPI.hpp>
-
 #include <TestReduce.hpp>
 #include <TestScan.hpp>
 #include <TestTeam.hpp>
@@ -76,162 +73,165 @@ protected:
   }
 };
 
-TEST_F( defaultdevicetype, test_utilities) {
+TEST_F( defaultdevicetype, test_utilities )
+{
   test_utilities();
 }
 
-TEST_F( defaultdevicetype, long_reduce) {
-  TestReduce< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce )
+{
+  TestReduce< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, double_reduce) {
-  TestReduce< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, double_reduce )
+{
+  TestReduce< double, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-
-TEST_F( defaultdevicetype , atomics )
+TEST_F( defaultdevicetype, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 3 ) ) );
 }
 
-/*TEST_F( defaultdevicetype , view_remap )
+/*TEST_F( defaultdevicetype, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::DefaultExecutionSpace > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::DefaultExecutionSpace > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::DefaultExecutionSpace > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
-}*/
-
-//----------------------------------------------------------------------------
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::DefaultExecutionSpace > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::DefaultExecutionSpace > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::DefaultExecutionSpace > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+  for ( size_t i3 = 0; i3 < N3; ++i3 ) {
+    for ( size_t i2 = 0; i2 < N2; ++i2 ) {
+      for ( size_t i1 = 0; i1 < N1; ++i1 ) {
+        for ( size_t i0 = 0; i0 < N0; ++i0 ) {
+          input( i0, i1, i2, i3 ) = ++value;
+        }
+      }
+    }
+  }
 
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+  for ( size_t i3 = 0; i3 < N3; ++i3 ) {
+    for ( size_t i2 = 0; i2 < N2; ++i2 ) {
+      for ( size_t i1 = 0; i1 < N1; ++i1 ) {
+        for ( size_t i0 = 0; i0 < N0; ++i0 ) {
+          ++value;
+          ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+        }
+      }
+    }
+  }
+}*/
 
-TEST_F( defaultdevicetype , view_aggregate )
+TEST_F( defaultdevicetype, view_aggregate )
 {
   TestViewAggregate< Kokkos::DefaultExecutionSpace >();
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( defaultdevicetype , scan )
+TEST_F( defaultdevicetype, scan )
 {
-  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1 , 1000 );
+  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1, 1000 );
   TestScan< Kokkos::DefaultExecutionSpace >( 1000000 );
   TestScan< Kokkos::DefaultExecutionSpace >( 10000000 );
   Kokkos::DefaultExecutionSpace::fence();
 }
 
-
-//----------------------------------------------------------------------------
-
-TEST_F( defaultdevicetype , compiler_macros )
+TEST_F( defaultdevicetype, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::DefaultExecutionSpace >() ) );
 }
 
-
-//----------------------------------------------------------------------------
-TEST_F( defaultdevicetype , cxx11 )
+TEST_F( defaultdevicetype, cxx11 )
 {
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(1) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(2) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(3) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 1 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 2 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 3 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 4 ) ) );
 }
 
-TEST_F( defaultdevicetype , team_vector )
+#if !defined( KOKKOS_CUDA_CLANG_WORKAROUND ) && !defined( KOKKOS_ARCH_PASCAL )
+TEST_F( defaultdevicetype, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 5 ) ) );
 }
+#endif
 
-TEST_F( defaultdevicetype , malloc )
+TEST_F( defaultdevicetype, malloc )
 {
-  int* data = (int*) Kokkos::kokkos_malloc(100*sizeof(int));
-  ASSERT_NO_THROW(data = (int*) Kokkos::kokkos_realloc(data,120*sizeof(int)));
-  Kokkos::kokkos_free(data);
+  int* data = (int*) Kokkos::kokkos_malloc( 100 * sizeof( int ) );
+  ASSERT_NO_THROW( data = (int*) Kokkos::kokkos_realloc( data, 120 * sizeof( int ) ) );
+  Kokkos::kokkos_free( data );
 
-  int* data2 = (int*) Kokkos::kokkos_malloc(0);
-  ASSERT_TRUE(data2==NULL);
-  Kokkos::kokkos_free(data2);
+  int* data2 = (int*) Kokkos::kokkos_malloc( 0 );
+  ASSERT_TRUE( data2 == NULL );
+  Kokkos::kokkos_free( data2 );
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestHWLOC.cpp b/lib/kokkos/core/unit_test/TestHWLOC.cpp
index 1637dec5de4ff762cfbd259ee47932b5e85eb4d0..d03d9b816f9c3ac3ee85b61886baa243e5160714 100644
--- a/lib/kokkos/core/unit_test/TestHWLOC.cpp
+++ b/lib/kokkos/core/unit_test/TestHWLOC.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,26 +44,24 @@
 #include <gtest/gtest.h>
 
 #include <iostream>
+
 #include <Kokkos_hwloc.hpp>
 
 namespace Test {
 
 class hwloc : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {}
+  static void SetUpTestCase() {}
 
-  static void TearDownTestCase()
-  {}
+  static void TearDownTestCase() {}
 };
 
-TEST_F( hwloc, query)
+TEST_F( hwloc, query )
 {
   std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]"
             << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]"
             << " PU[" << Kokkos::hwloc::get_available_threads_per_core()  << "]"
-            << std::endl ;
-}
-
+            << std::endl;
 }
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
index 9894d1ce697c1f109163f7711e62f12cfceef703..1dc349cc1268e680aabc0859a771c7a786a388de 100644
--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -47,509 +47,1675 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template <typename ExecSpace >
 struct TestMDRange_2D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType**, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
 
-  using DataType     = int ;
-  using ViewType     = typename Kokkos::View< DataType** ,  ExecSpace > ;
-  using HostViewType = typename ViewType::HostMirror ;
+  ViewType input_view;
 
-  ViewType input_view ;
+  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {}
 
-  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view("input_view", N0, N1) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j ) const
+  {
+    input_view( i, j ) = 1;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , const int j ) const
+  void operator()( const int i, const int j, double &lsum ) const
   {
-    input_view(i,j) = 1;
+    lsum += input_view( i, j ) * 2;
   }
 
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j ) const
+  {
+    input_view( i, j ) = 3;
+  }
 
-  static void test_for2( const int64_t N0, const int64_t N1 )
+  static void test_reduce2( const int N0, const int N1 )
   {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
 
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+  } // end test_reduce2
+
+  static void test_for2( const int N0, const int N1 )
+  {
     using namespace Kokkos::Experimental;
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> >;
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, InitTag > range_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op() + Default Tile: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "No info: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "D D: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {3,3} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {7,7} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {16,16} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {5,16} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
-
-  } //end test_for2
-}; //MDRange_2D
+  } // end test_for2
+}; // MDRange_2D
 
 template <typename ExecSpace >
 struct TestMDRange_3D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType***, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
 
-  using DataType = int ;
-  using ViewType     = typename Kokkos::View< DataType*** ,  ExecSpace > ;
-  using HostViewType = typename ViewType::HostMirror ;
+  ViewType input_view;
 
-  ViewType input_view ;
+  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {}
 
-  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view("input_view", N0, N1, N2) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, double &lsum ) const
+  {
+    lsum += input_view( i, j, k ) * 2;
+  }
 
+  // tagged operators
+  struct InitTag {};
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , const int j , const int k ) const
+  void operator()( const InitTag &, const int i, const int j, const int k ) const
   {
-    input_view(i,j,k) = 1;
+    input_view( i, j, k ) = 3;
   }
 
-  static void test_for3( const int64_t N0, const int64_t N1, const int64_t N2 )
+  static void test_reduce3( const int N0, const int N1, const int N2 )
   {
     using namespace Kokkos::Experimental;
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+  } // end test_reduce3
+
+  static void test_for3( const int N0, const int N1, const int N2 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Default>, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for3
+};
+
+template <typename ExecSpace >
+struct TestMDRange_4D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+
+  TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 3;
+  }
+
+  static void test_for4( const int N0, const int N1, const int N2, const int N3 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4> > range_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {3,5,7} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {8,8,8} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
-  } //end test_for3
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for4
 };
 
-} /* namespace */
-} /* namespace Test */
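+// Rank-5 analogue of TestMDRange_4D: the functor fills a 5-D view (plain, reduction, and InitTag operators)
+// and test_for5 verifies every entry on the host for several tile sizes and iteration orders.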
+template <typename ExecSpace >
+struct TestMDRange_5D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType*****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+
+  TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 3;
+  }
+
+  static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 7 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
+
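+// Rank-6 analogue: exercises MDRangePolicy< ExecSpace, Rank<6> > with the same fill-and-verify pattern.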
+template <typename ExecSpace >
+struct TestMDRange_6D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType******, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+
+  TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m, n ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 3;
+  }
+
+  static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } );
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); // Tile dims of 3,3,3,3,3,3 would exceed what Cuda can handle with debugging enabled.
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
 
-/*--------------------------------------------------------------------------*/
+} // namespace
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
index 868e64e9da5e46ee0d06f59736a0f4b20d576ee0..925f0e35ed6d12d3a822daa63421827fe636c86c 100644
--- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -156,7 +156,7 @@ struct fill_memory {
   void operator()( size_type i ) const
   {
     if ( i % STRIDE == 0 ) {
-      *m_pointers[i / STRIDE].ptr = i / STRIDE ;
+      *m_pointers[i / STRIDE].ptr = i / STRIDE;
     }
   }
 };
@@ -493,12 +493,12 @@ T smallest_power2_ge( T val )
   // Find the most significant nonzero bit.
   int first_nonzero_bit = Kokkos::Impl::bit_scan_reverse( val );
 
-  // If val is an integral power of 2, ceil( log2(val) ) is equal to the
+  // If val is an integral power of 2, ceil( log2( val ) ) is equal to the
   // most significant nonzero bit.  Otherwise, you need to add 1.
   int lg2_size = first_nonzero_bit +
                  !Kokkos::Impl::is_integral_power_of_two( val );
 
-  return T(1) << T(lg2_size);
+  return T( 1 ) << T( lg2_size );
 }
 
 // This test makes allocation requests for multiple sizes and interleaves
@@ -547,7 +547,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   phase1_size = ( ( phase1_size + num_chunk_sizes - 1 ) / num_chunk_sizes ) *
                 num_chunk_sizes;
 
-  // Make sure the phase 2 size is multiples of (2 * num_chunk_sizes).
+  // Make sure the phase 2 size is a multiple of ( 2 * num_chunk_sizes ).
   phase2_size =
     ( ( phase2_size + 2 * num_chunk_sizes - 1 ) / ( 2 * num_chunk_sizes ) ) *
     2 * num_chunk_sizes;
@@ -567,7 +567,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // each chunk size.
   work_view phase1_work( "Phase 1 Work", phase1_size );
   typename work_view::HostMirror host_phase1_work =
-    create_mirror_view(phase1_work);
+    create_mirror_view( phase1_work );
 
   size_t inner_size = phase1_size / num_chunk_sizes;
   unsigned chunk_size = base_chunk_size;
@@ -589,7 +589,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // deallocations with an equal number of allocations for each chunk size.
   work_view phase2_work( "Phase 2 Work", phase2_size );
   typename work_view::HostMirror host_phase2_work =
-    create_mirror_view(phase2_work);
+    create_mirror_view( phase2_work );
 
   inner_size = half_phase2_size / num_chunk_sizes;
   chunk_size = base_chunk_size;
@@ -614,7 +614,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // Initialize the phase 3 work view with all deallocations.
   work_view phase3_work( "Phase 3 Work", phase3_size );
   typename work_view::HostMirror host_phase3_work =
-    create_mirror_view(phase3_work);
+    create_mirror_view( phase3_work );
 
   inner_size = phase3_size / num_chunk_sizes;
 
diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 1bb45481c9b76d6dde29ff9e9d192d5ae4531829..6f2ca6a61c34b84f96cefd1195a6a11e2a6d32d1 100644
--- a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -48,7 +48,7 @@
 #include <sstream>
 #include <iostream>
 
-struct SomeTag{};
+struct SomeTag {};
 
 template< class ExecutionSpace >
 class TestRangePolicyConstruction {
@@ -56,179 +56,194 @@ public:
   TestRangePolicyConstruction() {
     test_compile_time_parameters();
   }
+
 private:
   void test_compile_time_parameters() {
     {
       Kokkos::Impl::expand_variadic();
-      Kokkos::Impl::expand_variadic(1,2,3);
+      Kokkos::Impl::expand_variadic( 1, 2, 3 );
     }
+
     {
       typedef Kokkos::RangePolicy<> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
   }
 };
@@ -240,258 +255,274 @@ public:
     test_compile_time_parameters();
     test_run_time_parameters();
   }
+
 private:
   void test_compile_time_parameters() {
     {
       typedef Kokkos::TeamPolicy<> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
   }
 
 
-  template<class policy_t>
+  template< class policy_t >
   void test_run_time_parameters_type() {
     int league_size = 131;
-    int team_size = 4<policy_t::execution_space::concurrency()?4:policy_t::execution_space::concurrency();
+    int team_size = 4 < policy_t::execution_space::concurrency() ? 4 : policy_t::execution_space::concurrency();
     int chunk_size = 4;
     int per_team_scratch = 1024;
     int per_thread_scratch = 16;
-    int scratch_size = per_team_scratch + per_thread_scratch*team_size;
-    policy_t p1(league_size,team_size);
-    ASSERT_EQ  (p1.league_size() , league_size);
-    ASSERT_EQ  (p1.team_size()   , team_size);
-    ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(0), 0);
-
-    policy_t p2 = p1.set_chunk_size(chunk_size);
-    ASSERT_EQ  (p1.league_size() , league_size);
-    ASSERT_EQ  (p1.team_size()   , team_size);
-    ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(0), 0);
-
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-
-    policy_t p3 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p3.league_size() , league_size);
-    ASSERT_EQ  (p3.team_size()   , team_size);
-    ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
-
-    policy_t p4 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p4.league_size() , league_size);
-    ASSERT_EQ  (p4.team_size()   , team_size);
-    ASSERT_EQ  (p4.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p4.scratch_size(0), per_thread_scratch*team_size);
-
-    policy_t p5 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch),Kokkos::PerTeam(per_team_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p5.league_size() , league_size);
-    ASSERT_EQ  (p5.team_size()   , team_size);
-    ASSERT_EQ  (p5.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p5.scratch_size(0), scratch_size);
-
-    policy_t p6 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p6.league_size() , league_size);
-    ASSERT_EQ  (p6.team_size()   , team_size);
-    ASSERT_EQ  (p6.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p6.scratch_size(0), scratch_size);
-
-    policy_t p7 = p3.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p3.league_size() , league_size);
-    ASSERT_EQ  (p3.team_size()   , team_size);
-    ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
-    ASSERT_EQ  (p7.league_size() , league_size);
-    ASSERT_EQ  (p7.team_size()   , team_size);
-    ASSERT_EQ  (p7.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p7.scratch_size(0), scratch_size);
-}
+    int scratch_size = per_team_scratch + per_thread_scratch * team_size;
+
+    policy_t p1( league_size, team_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    policy_t p2 = p1.set_chunk_size( chunk_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+
+    policy_t p3 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p4 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p4.league_size(),     league_size                    );
+    ASSERT_EQ  ( p4.team_size(),       team_size                      );
+    ASSERT_EQ  ( p4.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p4.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p5 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p5.league_size(),     league_size                    );
+    ASSERT_EQ  ( p5.team_size(),       team_size                      );
+    ASSERT_EQ  ( p5.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p5.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p6 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p6.league_size(),     league_size                    );
+    ASSERT_EQ  ( p6.team_size(),       team_size                      );
+    ASSERT_EQ  ( p6.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p6.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p7 = p3.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+    ASSERT_EQ  ( p7.league_size(),     league_size                    );
+    ASSERT_EQ  ( p7.team_size(),       team_size                      );
+    ASSERT_EQ  ( p7.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p7.scratch_size( 0 ), scratch_size                   );
+  }
+
   void test_run_time_parameters() {
-    test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace> >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace,SomeTag > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace> >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace, SomeTag > >();
   }
 };
diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp
deleted file mode 100644
index a465f39ca8ab428b72b68c103ec3989c92fb670f..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestQthread.cpp
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <gtest/gtest.h>
-
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Qthread.hpp>
-
-//----------------------------------------------------------------------------
-
-#include <TestAtomic.hpp>
-
-#include <TestViewAPI.hpp>
-#include <TestViewOfClass.hpp>
-
-#include <TestTeam.hpp>
-#include <TestRange.hpp>
-#include <TestReduce.hpp>
-#include <TestScan.hpp>
-#include <TestAggregate.hpp>
-#include <TestCompilerMacros.hpp>
-#include <TestTaskScheduler.hpp>
-// #include <TestTeamVector.hpp>
-
-namespace Test {
-
-class qthread : public ::testing::Test {
-protected:
-  static void SetUpTestCase()
-  {
-    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-    int threads_count = std::max( 1u , numa_count )
-                      * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
-    Kokkos::Qthread::initialize( threads_count );
-    Kokkos::Qthread::print_configuration( std::cout , true );
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::Qthread::finalize();
-  }
-};
-
-TEST_F( qthread , compiler_macros )
-{
-  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthread >() ) );
-}
-
-TEST_F( qthread, view_impl) {
-  test_view_impl< Kokkos::Qthread >();
-}
-
-TEST_F( qthread, view_api) {
-  TestViewAPI< double , Kokkos::Qthread >();
-}
-
-TEST_F( qthread , view_nested_view )
-{
-  ::Test::view_nested_view< Kokkos::Qthread >();
-}
-
-TEST_F( qthread , range_tag )
-{
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-}
-
-TEST_F( qthread , team_tag )
-{
-  TestTeamPolicy< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
-  TestTeamPolicy< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
-}
-
-TEST_F( qthread, long_reduce) {
-  TestReduce< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, double_reduce) {
-  TestReduce< double ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 );
-}
-
-TEST_F( qthread, team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 );
-}
-
-
-TEST_F( qthread , atomics )
-{
-  const int loop_count = 1e4 ;
-
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,3) ) );
-
-#if defined( KOKKOS_ENABLE_ASM )
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,3) ) );
-#endif
-
-}
-
-TEST_F( qthread , view_remap )
-{
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Qthread > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Qthread > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Qthread > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , view_aggregate )
-{
-  TestViewAggregate< Kokkos::Qthread >();
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , scan )
-{
-  TestScan< Kokkos::Qthread >::test_range( 1 , 1000 );
-  TestScan< Kokkos::Qthread >( 1000000 );
-  TestScan< Kokkos::Qthread >( 10000000 );
-  Kokkos::Qthread::fence();
-}
-
-TEST_F( qthread, team_shared ) {
-  TestSharedTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >();
-}
-
-TEST_F( qthread, shmem_size) {
-  TestShmemSize< Kokkos::Qthread >();
-}
-
-TEST_F( qthread , team_scan )
-{
-  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-}
-
-#if 0 /* disable */
-TEST_F( qthread , team_vector )
-{
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(4) ) );
-}
-#endif
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , task_policy )
-{
-  TestTaskScheduler::test_task_dep< Kokkos::Qthread >( 10 );
-  for ( long i = 0 ; i < 25 ; ++i ) TestTaskScheduler::test_fib< Kokkos::Qthread >(i);
-  for ( long i = 0 ; i < 35 ; ++i ) TestTaskScheduler::test_fib2< Kokkos::Qthread >(i);
-}
-
-TEST_F( qthread , task_team )
-{
-  TestTaskScheduler::test_task_team< Kokkos::Qthread >(1000);
-}
-
-//----------------------------------------------------------------------------
-
-} // namespace test
-
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
index e342e844c7665650732a38e49063abee626a4a8c..90411a57a0c9c871f946dd3a8b04b4af0554b380 100644
--- a/lib/kokkos/core/unit_test/TestRange.hpp
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,198 +45,204 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestRange {
+  typedef int value_type; ///< typedef required for the parallel_reduce
 
-  typedef int value_type ; ///< typedef required for the parallel_reduce
-
-  typedef Kokkos::View<int*,ExecSpace> view_type ;
+  typedef Kokkos::View< int*, ExecSpace > view_type;
 
-  view_type m_flags ;
+  view_type m_flags;
 
   struct VerifyInitTag {};
   struct ResetTag {};
   struct VerifyResetTag {};
 
   TestRange( const size_t N )
-    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags"), N )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N )
     {}
 
   static void test_for( const size_t N )
-    {
-      TestRange functor(N);
+  {
+    TestRange functor( N );
 
-      typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
+    typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyInitTag>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), functor );
 
-      Kokkos::deep_copy( host_flags , functor.m_flags );
+    Kokkos::deep_copy( host_flags, functor.m_flags );
 
-      size_t error_count = 0 ;
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        if ( int(i) != host_flags(i) ) ++error_count ;
-      }
-      ASSERT_EQ( error_count , size_t(0) );
+    size_t error_count = 0;
+    for ( size_t i = 0; i < N; ++i ) {
+      if ( int( i ) != host_flags( i ) ) ++error_count;
+    }
+    ASSERT_EQ( error_count, size_t( 0 ) );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,ResetTag>(0,N) , functor );
-      Kokkos::parallel_for( std::string("TestKernelFor") , Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyResetTag>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, ResetTag >( 0, N ), functor );
+    Kokkos::parallel_for( std::string( "TestKernelFor" ), Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyResetTag >( 0, N ), functor );
 
-      Kokkos::deep_copy( host_flags , functor.m_flags );
+    Kokkos::deep_copy( host_flags, functor.m_flags );
 
-      error_count = 0 ;
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        if ( int(2*i) != host_flags(i) ) ++error_count ;
-      }
-      ASSERT_EQ( error_count , size_t(0) );
+    error_count = 0;
+    for ( size_t i = 0; i < N; ++i ) {
+      if ( int( 2 * i ) != host_flags( i ) ) ++error_count;
     }
+    ASSERT_EQ( error_count, size_t( 0 ) );
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const int i ) const
-    { m_flags(i) = i ; }
+  { m_flags( i ) = i; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyInitTag & , const int i ) const
-    { if ( i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+  void operator()( const VerifyInitTag &, const int i ) const
+  {
+    if ( i != m_flags( i ) ) {
+      printf( "TestRange::test_for error at %d != %d\n", i, m_flags( i ) );
+    }
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const ResetTag & , const int i ) const
-    { m_flags(i) = 2 * m_flags(i); }
+  void operator()( const ResetTag &, const int i ) const
+  { m_flags( i ) = 2 * m_flags( i ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyResetTag & , const int i ) const
-    { if ( 2 * i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+  void operator()( const VerifyResetTag &, const int i ) const
+  {
+    if ( 2 * i != m_flags( i ) )
+    {
+      printf( "TestRange::test_for error at %d != %d\n", i, m_flags( i ) );
+    }
+  }
 
   //----------------------------------------
 
   struct OffsetTag {};
 
   static void test_reduce( const size_t N )
-    {
-      TestRange functor(N);
-      int total = 0 ;
+  {
+    TestRange functor( N );
+    int total = 0;
 
-      Kokkos::parallel_for(    Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
 
-      Kokkos::parallel_reduce( "TestKernelReduce" , Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor , total );
-      // sum( 0 .. N-1 )
-      ASSERT_EQ( size_t((N-1)*(N)/2) , size_t(total) );
+    Kokkos::parallel_reduce( "TestKernelReduce", Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor, total );
+    // sum( 0 .. N-1 )
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) );
 
-      Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor , total );
-      // sum( 1 .. N )
-      ASSERT_EQ( size_t((N)*(N+1)/2) , size_t(total) );
-    }
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag >( 0, N ), functor, total );
+    // sum( 1 .. N )
+    ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) );
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , value_type & update ) const
-    { update += m_flags(i); }
+  void operator()( const int i, value_type & update ) const
+  { update += m_flags( i ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const OffsetTag & , const int i , value_type & update ) const
-    { update += 1 + m_flags(i); }
+  void operator()( const OffsetTag &, const int i, value_type & update ) const
+  { update += 1 + m_flags( i ); }
 
   //----------------------------------------
 
   static void test_scan( const size_t N )
-    {
-      TestRange functor(N);
+  {
+    TestRange functor( N );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
 
-      Kokkos::parallel_scan( "TestKernelScan" , Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor );
-    }
+    Kokkos::parallel_scan( "TestKernelScan", Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag >( 0, N ), functor );
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const OffsetTag & , const int i , value_type & update , bool final ) const
-    {
-      update += m_flags(i);
+  void operator()( const OffsetTag &, const int i, value_type & update, bool final ) const
+  {
+    update += m_flags( i );
 
-      if ( final ) {
-        if ( update != (i*(i+1))/2 ) {
-          printf("TestRange::test_scan error %d : %d != %d\n",i,(i*(i+1))/2,m_flags(i));
-        }
+    if ( final ) {
+      if ( update != ( i * ( i + 1 ) ) / 2 ) {
+        printf( "TestRange::test_scan error %d : %d != %d\n", i, ( i * ( i + 1 ) ) / 2, m_flags( i ) );
       }
     }
+  }
 
-  static void test_dynamic_policy( const size_t N ) {
-
-
-    typedef Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+  static void test_dynamic_policy( const size_t N )
+  {
+    typedef Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
 
     {
-      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
-      Kokkos::View<int*,ExecSpace> a("A",N);
-
-      Kokkos::parallel_for( policy_t(0,N),
-          KOKKOS_LAMBDA (const typename policy_t::member_type& i) {
-        for(int k=0; k<(i<N/2?1:10000); k++ )
-          a(i)++;
-        count(ExecSpace::hardware_thread_id())++;
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace > a( "A", N );
+
+      Kokkos::parallel_for( policy_t( 0, N ), KOKKOS_LAMBDA ( const typename policy_t::member_type& i ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
       });
 
       int error = 0;
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
-        lsum += ( a(i)!= (i<N/2?1:10000) );
-      },error);
-      ASSERT_EQ(error,0);
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
 
-      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<size_t>(4*ExecSpace::concurrency())) ) {
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<size_t>( 4 * ExecSpace::concurrency() ) ) ) {
         size_t min = N;
         size_t max = 0;
-        for(int t=0; t<ExecSpace::concurrency(); t++) {
-          if(count(t)<min) min = count(t);
-          if(count(t)>max) max = count(t);
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
         }
-        ASSERT_TRUE(min<max);
-        //if(ExecSpace::concurrency()>2)
-        //  ASSERT_TRUE(2*min<max);
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
       }
-      
     }
 
     {
-      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
-      Kokkos::View<int*,ExecSpace> a("A",N);
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace > a( "A", N );
 
       int sum = 0;
-      Kokkos::parallel_reduce( policy_t(0,N),
-          KOKKOS_LAMBDA (const typename policy_t::member_type& i, int& lsum) {
-        for(int k=0; k<(i<N/2?1:10000); k++ )
-          a(i)++;
-        count(ExecSpace::hardware_thread_id())++;
+      Kokkos::parallel_reduce( policy_t( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
         lsum++;
-      },sum);
-      ASSERT_EQ(sum,N);
+      }, sum );
+      ASSERT_EQ( sum, N );
 
       int error = 0;
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
-        lsum += ( a(i)!= (i<N/2?1:10000) );
-      },error);
-      ASSERT_EQ(error,0);
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
 
-      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<size_t>(4*ExecSpace::concurrency())) ) {
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<size_t>( 4 * ExecSpace::concurrency() ) ) ) {
         size_t min = N;
         size_t max = 0;
-        for(int t=0; t<ExecSpace::concurrency(); t++) {
-          if(count(t)<min) min = count(t);
-          if(count(t)>max) max = count(t);
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
         }
-        ASSERT_TRUE(min<max);
-        //if(ExecSpace::concurrency()>2)
-        //  ASSERT_TRUE(2*min<max);
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
       }
     }
-
   }
 };
 
-} /* namespace */
-} /* namespace Test */
-
-/*--------------------------------------------------------------------------*/
+} // namespace
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
index 645fc9e31b3b1cf86d06779304343cc93cc2242a..7e77dadf6249fe3eaa763c0c9848b93965379e7e 100644
--- a/lib/kokkos/core/unit_test/TestReduce.hpp
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -48,24 +48,23 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class ReduceFunctor
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   struct value_type {
-    ScalarType value[3] ;
+    ScalarType value[3];
   };
 
-  const size_type nwork ;
+  const size_type nwork;
 
-  ReduceFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+  ReduceFunctor( const size_type & arg_nwork )
+    : nwork( arg_nwork ) {}
 
   ReduceFunctor( const ReduceFunctor & rhs )
     : nwork( rhs.nwork ) {}
@@ -74,66 +73,63 @@ public:
   KOKKOS_INLINE_FUNCTION
   void init( value_type & dst ) const
   {
-    dst.value[0] = 0 ;
-    dst.value[1] = 0 ;
-    dst.value[2] = 0 ;
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
   }
 */
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile value_type & dst ,
+  void join( volatile value_type & dst,
              const volatile value_type & src ) const
   {
-    dst.value[0] += src.value[0] ;
-    dst.value[1] += src.value[1] ;
-    dst.value[2] += src.value[2] ;
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , value_type & dst ) const
+  void operator()( size_type iwork, value_type & dst ) const
   {
-    dst.value[0] += 1 ;
-    dst.value[1] += iwork + 1 ;
-    dst.value[2] += nwork - iwork ;
+    dst.value[0] += 1;
+    dst.value[1] += iwork + 1;
+    dst.value[2] += nwork - iwork;
   }
 };
 
 template< class DeviceType >
-class ReduceFunctorFinal : public ReduceFunctor< long , DeviceType > {
+class ReduceFunctorFinal : public ReduceFunctor< long, DeviceType > {
 public:
-
-  typedef typename ReduceFunctor< long , DeviceType >::value_type value_type ;
+  typedef typename ReduceFunctor< long, DeviceType >::value_type value_type;
 
   ReduceFunctorFinal( const size_t n )
-    : ReduceFunctor<long,DeviceType>(n)
-    {}
+    : ReduceFunctor< long, DeviceType >( n ) {}
 
   KOKKOS_INLINE_FUNCTION
   void final( value_type & dst ) const
   {
-    dst.value[0] = - dst.value[0] ;
-    dst.value[1] = - dst.value[1] ;
-    dst.value[2] = - dst.value[2] ;
+    dst.value[0] = -dst.value[0];
+    dst.value[1] = -dst.value[1];
+    dst.value[2] = -dst.value[2];
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class RuntimeReduceFunctor
 {
 public:
   // Required for functor:
-  typedef DeviceType  execution_space ;
-  typedef ScalarType  value_type[] ;
-  const unsigned      value_count ;
-
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
 
   // Unit test details:
 
-  typedef typename execution_space::size_type  size_type ;
+  typedef typename execution_space::size_type size_type;
 
-  const size_type     nwork ;
+  const size_type     nwork;
 
-  RuntimeReduceFunctor( const size_type arg_nwork ,
+  RuntimeReduceFunctor( const size_type arg_nwork,
                         const size_type arg_count )
     : value_count( arg_count )
     , nwork( arg_nwork ) {}
@@ -141,247 +137,251 @@ public:
   KOKKOS_INLINE_FUNCTION
   void init( ScalarType dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile ScalarType dst[] ,
+  void join( volatile ScalarType dst[],
              const volatile ScalarType src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , ScalarType dst[] ) const
+  void operator()( size_type iwork, ScalarType dst[] ) const
   {
-    const size_type tmp[3] = { 1 , iwork + 1 , nwork - iwork };
+    const size_type tmp[3] = { 1, iwork + 1, nwork - iwork };
 
-    for ( size_type i = 0 ; i < value_count ; ++i ) {
+    for ( size_type i = 0; i < value_count; ++i ) {
       dst[i] += tmp[ i % 3 ];
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class RuntimeReduceMinMax
 {
 public:
   // Required for functor:
-  typedef DeviceType  execution_space ;
-  typedef ScalarType  value_type[] ;
-  const unsigned      value_count ;
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
 
   // Unit test details:
 
-  typedef typename execution_space::size_type  size_type ;
+  typedef typename execution_space::size_type size_type;
 
-  const size_type     nwork ;
-  const ScalarType    amin ;
-  const ScalarType    amax ;
+  const size_type     nwork;
+  const ScalarType    amin;
+  const ScalarType    amax;
 
-  RuntimeReduceMinMax( const size_type arg_nwork ,
+  RuntimeReduceMinMax( const size_type arg_nwork,
                        const size_type arg_count )
     : value_count( arg_count )
     , nwork( arg_nwork )
-    , amin( std::numeric_limits<ScalarType>::min() )
-    , amax( std::numeric_limits<ScalarType>::max() )
+    , amin( std::numeric_limits< ScalarType >::min() )
+    , amax( std::numeric_limits< ScalarType >::max() )
     {}
 
   KOKKOS_INLINE_FUNCTION
   void init( ScalarType dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) {
-      dst[i] = i % 2 ? amax : amin ;
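+    // Odd components compute a minimum (seeded with amax); even components compute a maximum (seeded with amin).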
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? amax : amin;
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile ScalarType dst[] ,
+  void join( volatile ScalarType dst[],
              const volatile ScalarType src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) {
+    for ( unsigned i = 0; i < value_count; ++i ) {
       dst[i] = i % 2 ? ( dst[i] < src[i] ? dst[i] : src[i] )  // min
                      : ( dst[i] > src[i] ? dst[i] : src[i] ); // max
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , ScalarType dst[] ) const
+  void operator()( size_type iwork, ScalarType dst[] ) const
   {
-    const ScalarType tmp[2] = { ScalarType(iwork + 1)
-                              , ScalarType(nwork - iwork) };
+    const ScalarType tmp[2] = { ScalarType( iwork + 1 )
+                              , ScalarType( nwork - iwork ) };
 
-    for ( size_type i = 0 ; i < value_count ; ++i ) {
-      dst[i] = i % 2 ? ( dst[i] < tmp[i%2] ? dst[i] : tmp[i%2] )
-                     : ( dst[i] > tmp[i%2] ? dst[i] : tmp[i%2] );
+    for ( size_type i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2] )
+                     : ( dst[i] > tmp[i % 2] ? dst[i] : tmp[i % 2] );
     }
   }
 };
 
 template< class DeviceType >
-class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long , DeviceType > {
+class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long, DeviceType > {
 public:
+  typedef RuntimeReduceFunctor< long, DeviceType > base_type;
+  typedef typename base_type::value_type value_type;
+  typedef long scalar_type;
 
-  typedef RuntimeReduceFunctor< long , DeviceType > base_type ;
-  typedef typename base_type::value_type value_type ;
-  typedef long scalar_type ;
-
-  RuntimeReduceFunctorFinal( const size_t theNwork , const size_t count ) : base_type(theNwork,count) {}
+  RuntimeReduceFunctorFinal( const size_t theNwork, const size_t count )
+    : base_type( theNwork, count ) {}
 
   KOKKOS_INLINE_FUNCTION
   void final( value_type dst ) const
   {
-    for ( unsigned i = 0 ; i < base_type::value_count ; ++i ) {
-      dst[i] = - dst[i] ;
+    for ( unsigned i = 0; i < base_type::value_count; ++i ) {
+      dst[i] = -dst[i];
     }
   }
 };
+
 } // namespace Test
 
 namespace {
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduce
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduce( const size_type & nwork )
   {
-    run_test(nwork);
-    run_test_final(nwork);
+    run_test( nwork );
+    run_test_final( nwork );
   }
 
   void run_test( const size_type & nwork )
   {
-    typedef Test::ReduceFunctor< ScalarType , execution_space > functor_type ;
-    typedef typename functor_type::value_type value_type ;
+    typedef Test::ReduceFunctor< ScalarType, execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
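+    // nsum = 1 + 2 + ... + nw, with the even factor divided first so the integer arithmetic stays exact.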
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
       }
     }
   }
 
   void run_test_final( const size_type & nwork )
   {
-    typedef Test::ReduceFunctorFinal< execution_space > functor_type ;
-    typedef typename functor_type::value_type value_type ;
+    typedef Test::ReduceFunctorFinal< execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , - result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i].value[j] );
       }
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduceDynamic
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduceDynamic( const size_type nwork )
   {
-    run_test_dynamic(nwork);
-    run_test_dynamic_minmax(nwork);
-    run_test_dynamic_final(nwork);
+    run_test_dynamic( nwork );
+    run_test_dynamic_minmax( nwork );
+    run_test_dynamic_final( nwork );
   }
 
   void run_test_dynamic( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
-    ScalarType result[ Repeat ][ Count ] ;
+    ScalarType result[ Repeat ][ Count ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i][j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i][j] );
       }
     }
   }
 
   void run_test_dynamic_minmax( const size_type nwork )
   {
-    typedef Test::RuntimeReduceMinMax< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceMinMax< ScalarType, execution_space > functor_type;
 
     enum { Count = 2 };
     enum { Repeat = 100 };
 
-    ScalarType result[ Repeat ][ Count ] ;
+    ScalarType result[ Repeat ][ Count ];
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
         if ( nwork == 0 )
         {
-          ScalarType amin( std::numeric_limits<ScalarType>::min() );
-          ScalarType amax( std::numeric_limits<ScalarType>::max() );
-          const ScalarType correct = (j%2) ? amax : amin;
-          ASSERT_EQ( (ScalarType) correct , result[i][j] );
-        } else {
-          const unsigned long correct = j % 2 ? 1 : nwork ;
-          ASSERT_EQ( (ScalarType) correct , result[i][j] );
+          ScalarType amin( std::numeric_limits< ScalarType >::min() );
+          ScalarType amax( std::numeric_limits< ScalarType >::max() );
+          const ScalarType correct = ( j % 2 ) ? amax : amin;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
+        }
+        else {
+          const unsigned long correct = j % 2 ? 1 : nwork;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
         }
       }
     }
@@ -389,169 +389,172 @@ public:
 
   void run_test_dynamic_final( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
-    typename functor_type::scalar_type result[ Repeat ][ Count ] ;
+    typename functor_type::scalar_type result[ Repeat ][ Count ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "TestKernelReduce" , nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "TestKernelReduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
 
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , - result[i][j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i][j] );
       }
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduceDynamicView
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduceDynamicView( const size_type nwork )
   {
-    run_test_dynamic_view(nwork);
+    run_test_dynamic_view( nwork );
   }
 
   void run_test_dynamic_view( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
 
-    typedef Kokkos::View< ScalarType* , DeviceType > result_type ;
-    typedef typename result_type::HostMirror result_host_type ;
+    typedef Kokkos::View< ScalarType*, DeviceType > result_type;
+    typedef typename result_type::HostMirror result_host_type;
 
-    const unsigned CountLimit = 23 ;
+    const unsigned CountLimit = 23;
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned count = 0 ; count < CountLimit ; ++count ) {
+    for ( unsigned count = 0; count < CountLimit; ++count ) {
 
-      result_type result("result",count);
+      result_type result( "result", count );
       result_host_type host_result = Kokkos::create_mirror( result );
 
       // Test result to host pointer:
 
-      std::string str("TestKernelReduce");
-      if(count%2==0)
-        Kokkos::parallel_reduce( nw , functor_type(nw,count) , host_result.ptr_on_device() );
-      else
-        Kokkos::parallel_reduce( str , nw , functor_type(nw,count) , host_result.ptr_on_device() );
+      std::string str( "TestKernelReduce" );
+      if ( count % 2 == 0 ) {
+        Kokkos::parallel_reduce( nw, functor_type( nw, count ), host_result.ptr_on_device() );
+      }
+      else {
+        Kokkos::parallel_reduce( str, nw, functor_type( nw, count ), host_result.ptr_on_device() );
+      }
 
-      for ( unsigned j = 0 ; j < count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( host_result(j), (ScalarType) correct );
-        host_result(j) = 0 ;
+      for ( unsigned j = 0; j < count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( host_result( j ), (ScalarType) correct );
+        host_result( j ) = 0;
       }
     }
   }
 };
-}
+
+} // namespace
 
 // Computes y^T*A*x
-// (modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
+// ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
 
 #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA )
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestTripleNestedReduce
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
-  //------------------------------------
-
-  TestTripleNestedReduce( const size_type & nrows , const size_type & ncols
-                        , const size_type & team_size , const size_type & vector_length )
+  TestTripleNestedReduce( const size_type & nrows, const size_type & ncols
+                        , const size_type & team_size, const size_type & vector_length )
   {
-    run_test( nrows , ncols , team_size, vector_length );
+    run_test( nrows, ncols, team_size, vector_length );
   }
 
-  void run_test( const size_type & nrows , const size_type & ncols
+  void run_test( const size_type & nrows, const size_type & ncols
                , const size_type & team_size, const size_type & vector_length )
   {
     //typedef Kokkos::LayoutLeft Layout;
     typedef Kokkos::LayoutRight Layout;
 
-    typedef Kokkos::View<ScalarType* , DeviceType>            ViewVector;
-    typedef Kokkos::View<ScalarType** , Layout , DeviceType>   ViewMatrix;
-    ViewVector y( "y" , nrows );
-    ViewVector x( "x" , ncols );
-    ViewMatrix A( "A" , nrows , ncols );
+    typedef Kokkos::View< ScalarType*, DeviceType >            ViewVector;
+    typedef Kokkos::View< ScalarType**, Layout, DeviceType >   ViewMatrix;
+
+    ViewVector y( "y", nrows );
+    ViewVector x( "x", ncols );
+    ViewMatrix A( "A", nrows, ncols );
 
     typedef Kokkos::RangePolicy<DeviceType> range_policy;
 
-    // Initialize y vector
-    Kokkos::parallel_for( range_policy( 0 , nrows ) , KOKKOS_LAMBDA( const int i ) { y( i ) = 1; } );
+    // Initialize y vector.
+    Kokkos::parallel_for( range_policy( 0, nrows ), KOKKOS_LAMBDA ( const int i ) { y( i ) = 1; } );
 
-    // Initialize x vector
-    Kokkos::parallel_for( range_policy( 0 , ncols ) , KOKKOS_LAMBDA( const int i ) { x( i ) = 1; } );
+    // Initialize x vector.
+    Kokkos::parallel_for( range_policy( 0, ncols ), KOKKOS_LAMBDA ( const int i ) { x( i ) = 1; } );
 
-    typedef Kokkos::TeamPolicy<DeviceType>                        team_policy;
-    typedef typename Kokkos::TeamPolicy<DeviceType>::member_type  member_type;
+    typedef Kokkos::TeamPolicy< DeviceType >                        team_policy;
+    typedef typename Kokkos::TeamPolicy< DeviceType >::member_type  member_type;
 
-    // Initialize A matrix, note 2D indexing computation
-    Kokkos::parallel_for( team_policy( nrows , Kokkos::AUTO ) , KOKKOS_LAMBDA( const member_type& teamMember ) {
+    // Initialize A matrix; note the 2D indexing computation.
+    Kokkos::parallel_for( team_policy( nrows, Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type & teamMember ) {
       const int j = teamMember.league_rank();
-      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , ncols ) , [&] ( const int i ) {
-        A( j , i ) = 1;
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, ncols ), [&] ( const int i ) {
+        A( j, i ) = 1;
       } );
     } );
 
-    // Three level parallelism kernel to force caching of vector x
+    // Three-level parallelism kernel to force caching of vector x.
     ScalarType result = 0.0;
     int chunk_size = 128;
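+    // Each team reduces one chunk of chunk_size consecutive rows.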
-    Kokkos::parallel_reduce( team_policy( nrows/chunk_size , team_size , vector_length ) , KOKKOS_LAMBDA ( const member_type& teamMember , double &update ) {
+    Kokkos::parallel_reduce( team_policy( nrows / chunk_size, team_size, vector_length ),
+                             KOKKOS_LAMBDA ( const member_type & teamMember, double & update ) {
       const int row_start = teamMember.league_rank() * chunk_size;
       const int row_end   = row_start + chunk_size;
-      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , row_start , row_end ) , [&] ( const int i ) {
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, row_start, row_end ), [&] ( const int i ) {
         ScalarType sum_i = 0.0;
-        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember , ncols ) , [&] ( const int j , ScalarType &innerUpdate ) {
-          innerUpdate += A( i , j ) * x( j );
-        } , sum_i );
-        Kokkos::single( Kokkos::PerThread( teamMember ) , [&] () {
+        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember, ncols ), [&] ( const int j, ScalarType &innerUpdate ) {
+          innerUpdate += A( i, j ) * x( j );
+        }, sum_i );
+        Kokkos::single( Kokkos::PerThread( teamMember ), [&] () {
           update += y( i ) * sum_i;
         } );
       } );
-    } , result );
+    }, result );
 
-    const ScalarType solution= ( ScalarType ) nrows * ( ScalarType ) ncols;
-    ASSERT_EQ( solution , result );
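+    // With y, x, and A all filled with ones, y^T*A*x reduces to nrows * ncols.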
+    const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols;
+    ASSERT_EQ( solution, result );
   }
 };
 
-#else /* #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA ) */
+#else // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA )
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestTripleNestedReduce
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
-  TestTripleNestedReduce( const size_type & , const size_type
-                        , const size_type & , const size_type )
-  { }
+  TestTripleNestedReduce( const size_type &, const size_type
+                        , const size_type &, const size_type )
+  {}
 };
 
 #endif
@@ -559,38 +562,38 @@ public:
 //--------------------------------------------------------------------------
 
 namespace Test {
+
 namespace ReduceCombinatorical {
 
-template<class Scalar,class Space = Kokkos::HostSpace>
+template< class Scalar, class Space = Kokkos::HostSpace >
 struct AddPlus {
 public:
-  //Required
+  // Required.
   typedef AddPlus reducer_type;
   typedef Scalar value_type;
 
-  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+  typedef Kokkos::View< value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
 
 private:
   result_view_type result;
 
 public:
+  AddPlus( value_type & result_ ) : result( &result_ ) {}
 
-  AddPlus(value_type& result_):result(&result_) {}
-
-  //Required
+  // Required.
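+  // Note: join() adds an extra 1 on every combine, so concurrent executions produce a result larger than the plain sum.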
   KOKKOS_INLINE_FUNCTION
-  void join(value_type& dest, const value_type& src)  const {
+  void join( value_type & dest, const value_type & src ) const {
     dest += src + 1;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
+  void join( volatile value_type & dest, const volatile value_type & src ) const {
     dest += src + 1;
   }
 
-  //Optional
+  // Optional.
   KOKKOS_INLINE_FUNCTION
-  void init( value_type& val)  const {
+  void init( value_type & val )  const {
     val = value_type();
   }
 
@@ -599,624 +602,651 @@ public:
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalar;
 
 template<>
-struct FunctorScalar<0>{
-  FunctorScalar(Kokkos::View<double> r):result(r) {}
-  Kokkos::View<double> result;
+struct FunctorScalar< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i,double& update) const {
-    update+=i;
+  void operator()( const int & i, double & update ) const {
+    update += i;
   }
 };
 
 template<>
-struct FunctorScalar<1>{
-  FunctorScalar(Kokkos::View<double> r):result(r) {}
-  Kokkos::View<double> result;
-
+struct FunctorScalar< 1 > {
   typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
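+    // Each of the team_size threads adds league_rank / team_size, so every league contributes its league_rank to the total.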
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarInit;
 
 template<>
-struct FunctorScalarInit<0> {
-  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarInit< 0 > {
+  Kokkos::View< double > result;
 
-  Kokkos::View<double> result;
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarInit<1> {
-  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarFinal;
 
-
 template<>
-struct FunctorScalarFinal<0> {
-  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarFinal< 0 > {
   Kokkos::View<double> result;
+
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
 template<>
-struct FunctorScalarFinal<1> {
-  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team, double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
+
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoin;
 
 template<>
-struct FunctorScalarJoin<0> {
-  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarJoin< 0 > {
   Kokkos::View<double> result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 };
 
 template<>
-struct FunctorScalarJoin<1> {
-  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoin< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinFinal;
 
 template<>
-struct FunctorScalarJoinFinal<0> {
-  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinal< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
 
-  Kokkos::View<double> result;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
 template<>
-struct FunctorScalarJoinFinal<1> {
-  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinInit;
 
 template<>
-struct FunctorScalarJoinInit<0> {
-  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinInit< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
 
-  Kokkos::View<double> result;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarJoinInit<1> {
-  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinFinalInit;
 
 template<>
-struct FunctorScalarJoinFinalInit<0> {
-  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarJoinFinalInit< 0 > {
   Kokkos::View<double> result;
 
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarJoinFinalInit<1> {
-  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinalInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
+
 struct Functor1 {
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i,double& update) const {
-    update+=i;
+  void operator()( const int & i, double & update ) const {
+    update += i;
   }
 };
 
 struct Functor2 {
   typedef double value_type[];
+
   const unsigned value_count;
 
-  Functor2(unsigned n):value_count(n){}
+  Functor2( unsigned n ) : value_count( n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const unsigned& i,double update[]) const {
-    for(unsigned j=0;j<value_count;j++)
-      update[j]+=i;
+  void operator()( const unsigned & i, double update[] ) const {
+    for ( unsigned j = 0; j < value_count; j++ ) {
+      update[j] += i;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
   void init( double dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile double dst[] ,
+  void join( volatile double dst[],
              const volatile double src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
   }
 };
 
-}
-}
+} // namespace ReduceCombinatorical
+
+} // namespace Test
 
 namespace Test {
 
-template<class ExecSpace = Kokkos::DefaultExecutionSpace>
+template< class ExecSpace = Kokkos::DefaultExecutionSpace >
 struct TestReduceCombinatoricalInstantiation {
-  template<class ... Args>
-  static void CallParallelReduce(Args... args) {
-    Kokkos::parallel_reduce(args...);
+  template< class ... Args >
+  static void CallParallelReduce( Args... args ) {
+    Kokkos::parallel_reduce( args... );
   }
 
-  template<class ... Args>
-  static void AddReturnArgument(Args... args) {
-    Kokkos::View<double,Kokkos::HostSpace> result_view("ResultView");
-    double expected_result = 1000.0*999.0/2.0;
+  template< class ... Args >
+  static void AddReturnArgument( Args... args ) {
+    Kokkos::View< double, Kokkos::HostSpace > result_view( "ResultView" );
+    double expected_result = 1000.0 * 999.0 / 2.0;
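+    // 0 + 1 + ... + 999: the expected reduction over the N = 1000 work items set up in AddPolicy().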
 
     double value = 0;
-    Kokkos::parallel_reduce(args...,value);
-    ASSERT_EQ(expected_result,value);
+    Kokkos::parallel_reduce( args..., value );
+    ASSERT_EQ( expected_result, value );
 
     result_view() = 0;
-    CallParallelReduce(args...,result_view);
-    ASSERT_EQ(expected_result,result_view());
+    CallParallelReduce( args..., result_view );
+    ASSERT_EQ( expected_result, result_view() );
 
     value = 0;
-    CallParallelReduce(args...,Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&value));
-    ASSERT_EQ(expected_result,value);
+    CallParallelReduce( args..., Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >( &value ) );
+    ASSERT_EQ( expected_result, value );
 
     result_view() = 0;
-    const Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> result_view_const_um = result_view;
-    CallParallelReduce(args...,result_view_const_um);
-    ASSERT_EQ(expected_result,result_view_const_um());
+    const Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_const_um = result_view;
+    CallParallelReduce( args..., result_view_const_um );
+    ASSERT_EQ( expected_result, result_view_const_um() );
 
     value = 0;
-    CallParallelReduce(args...,Test::ReduceCombinatorical::AddPlus<double>(value));
-    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<value);
-    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<=value);
-    else
-      ASSERT_EQ(expected_result,value);
+    CallParallelReduce( args..., Test::ReduceCombinatorical::AddPlus< double >( value ) );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
 
     value = 0;
-    Test::ReduceCombinatorical::AddPlus<double> add(value);
-    CallParallelReduce(args...,add);
-    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<value);
-    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<=value);
-    else
-      ASSERT_EQ(expected_result,value);
+    Test::ReduceCombinatorical::AddPlus< double > add( value );
+    CallParallelReduce( args..., add );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
   }
 
-
-  template<class ... Args>
-  static void AddLambdaRange(void*,Args... args) {
-    AddReturnArgument(args...,  KOKKOS_LAMBDA (const int&i , double& lsum) {
+  template< class ... Args >
+  static void AddLambdaRange( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const int & i, double & lsum ) {
       lsum += i;
     });
   }
 
-  template<class ... Args>
-  static void AddLambdaTeam(void*,Args... args) {
-    AddReturnArgument(args..., KOKKOS_LAMBDA (const Kokkos::TeamPolicy<>::member_type& team, double& update) {
-      update+=1.0/team.team_size()*team.league_rank();
+  template< class ... Args >
+  static void AddLambdaTeam( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type & team, double & update ) {
+      update += 1.0 / team.team_size() * team.league_rank();
     });
   }
 
-  template<class ... Args>
-  static void AddLambdaRange(Kokkos::InvalidType,Args... args) {
-  }
+  template< class ... Args >
+  static void AddLambdaRange( Kokkos::InvalidType, Args... args ) {}
 
-  template<class ... Args>
-  static void AddLambdaTeam(Kokkos::InvalidType,Args... args) {
-  }
+  template< class ... Args >
+  static void AddLambdaTeam( Kokkos::InvalidType, Args... args ) {}
 
-  template<int ISTEAM, class ... Args>
-  static void AddFunctor(Args... args) {
-    Kokkos::View<double> result_view("FunctorView");
-    auto h_r = Kokkos::create_mirror_view(result_view);
-    Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view);
-    double expected_result = 1000.0*999.0/2.0;
+  template< int ISTEAM, class ... Args >
+  static void AddFunctor( Args... args ) {
+    Kokkos::View< double > result_view( "FunctorView" );
+    auto h_r = Kokkos::create_mirror_view( result_view );
+    Test::ReduceCombinatorical::FunctorScalar< ISTEAM > functor( result_view );
+    double expected_result = 1000.0 * 999.0 / 2.0;
 
-    AddReturnArgument(args..., functor);
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view));
+    AddReturnArgument( args..., functor );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalar< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarInit< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoin< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoinInit< ISTEAM >( result_view ) );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
   }
 
-  template<class ... Args>
-  static void AddFunctorLambdaRange(Args... args) {
-    AddFunctor<0,Args...>(args...);
-    #ifdef  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    AddLambdaRange(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
-    #endif
+  template< class ... Args >
+  static void AddFunctorLambdaRange( Args... args ) {
+    AddFunctor< 0, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaRange( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
   }
 
-  template<class ... Args>
-  static void AddFunctorLambdaTeam(Args... args) {
-    AddFunctor<1,Args...>(args...);
-    #ifdef  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    AddLambdaTeam(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
-    #endif
+  template< class ... Args >
+  static void AddFunctorLambdaTeam( Args... args ) {
+    AddFunctor< 1, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaTeam( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
   }
 
-  template<class ... Args>
-  static void AddPolicy(Args... args) {
+  template< class ... Args >
+  static void AddPolicy( Args... args ) {
     int N = 1000;
-    Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    Kokkos::RangePolicy< ExecSpace > policy( 0, N );
 
-    AddFunctorLambdaRange(args...,1000);
-    AddFunctorLambdaRange(args...,N);
-    AddFunctorLambdaRange(args...,policy);
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace>(0,N));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(0,N).set_chunk_size(10));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N).set_chunk_size(10));
+    AddFunctorLambdaRange( args..., 1000 );
+    AddFunctorLambdaRange( args..., N );
+    AddFunctorLambdaRange( args..., policy );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( 0, N ).set_chunk_size( 10 ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ).set_chunk_size( 10 ) );
 
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace>(N,Kokkos::AUTO));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(N,Kokkos::AUTO).set_chunk_size(10));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO).set_chunk_size(10));
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
   }
 
-
   static void execute_a() {
     AddPolicy();
   }
 
   static void execute_b() {
-    std::string s("Std::String");
-    AddPolicy(s.c_str());
-    AddPolicy("Char Constant");
+    std::string s( "Std::String" );
+    AddPolicy( s.c_str() );
+    AddPolicy( "Char Constant" );
   }
 
   static void execute_c() {
-    std::string s("Std::String");
-    AddPolicy(s);
+    std::string s( "Std::String" );
+    AddPolicy( s );
   }
 };
 
-template<class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace>
+template< class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace >
 struct TestReducers {
-
   struct SumFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value += values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value += values( i );
     }
   };
 
   struct ProdFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value *= values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value *= values( i );
     }
   };
 
   struct MinFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      if(values(i) < value)
-        value = values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) < value ) value = values( i );
     }
   };
 
   struct MaxFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      if(values(i) > value)
-        value = values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) > value ) value = values( i );
     }
   };
 
   struct MinLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type& value) const {
-      if(values(i) < value.val) {
-        value.val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) < value.val ) {
+        value.val = values( i );
         value.loc = i;
       }
     }
   };
 
   struct MaxLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type& value) const {
-      if(values(i) > value.val) {
-        value.val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.val ) {
+        value.val = values( i );
         value.loc = i;
       }
     }
   };
 
   struct MinMaxLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type& value) const {
-      if(values(i) > value.max_val) {
-        value.max_val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.max_val ) {
+        value.max_val = values( i );
         value.max_loc = i;
       }
-      if(values(i) < value.min_val) {
-        value.min_val = values(i);
+
+      if ( values( i ) < value.min_val ) {
+        value.min_val = values( i );
         value.min_loc = i;
       }
     }
   };
 
   struct BAndFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value & values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value & values( i );
     }
   };
 
   struct BOrFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value | values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value | values( i );
     }
   };
 
   struct BXorFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value ^ values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value ^ values( i );
     }
   };
 
   struct LAndFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value && values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value && values( i );
     }
   };
 
   struct LOrFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value || values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value || values( i );
     }
   };
 
   struct LXorFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value ? (!values(i)) : values(i);
+    void operator()( const int & i, Scalar & value ) const {
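+      // Logical XOR: the running value flips whenever values( i ) is nonzero.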
+      value = value ? ( !values( i ) ) : values( i );
     }
   };
 
-  static void test_sum(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_sum( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_sum = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100);
-      reference_sum += h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100 );
+      reference_sum += h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     SumFunctor f;
     f.values = values;
@@ -1224,556 +1254,669 @@ struct TestReducers {
 
     {
       Scalar sum_scalar = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_scalar(sum_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(sum_scalar,reference_sum);
+      Kokkos::Experimental::Sum< Scalar > reducer_scalar( sum_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( sum_scalar, reference_sum );
+
       Scalar sum_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(sum_scalar_view,reference_sum);
+      ASSERT_EQ( sum_scalar_view, reference_sum );
     }
+
     {
       Scalar sum_scalar_init = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_scalar_init(sum_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(sum_scalar_init,reference_sum);
+      Kokkos::Experimental::Sum< Scalar > reducer_scalar_init( sum_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( sum_scalar_init, reference_sum );
+
       Scalar sum_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(sum_scalar_init_view,reference_sum);
+      ASSERT_EQ( sum_scalar_init_view, reference_sum );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > sum_view( "View" );
       sum_view() = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_view(sum_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Sum< Scalar > reducer_view( sum_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar sum_view_scalar = sum_view();
-      ASSERT_EQ(sum_view_scalar,reference_sum);
+      ASSERT_EQ( sum_view_scalar, reference_sum );
+
       Scalar sum_view_view = reducer_view.result_view()();
-      ASSERT_EQ(sum_view_view,reference_sum);
+      ASSERT_EQ( sum_view_view, reference_sum );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > sum_view_init( "View" );
       sum_view_init() = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_view_init(sum_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Sum< Scalar > reducer_view_init( sum_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar sum_view_init_scalar = sum_view_init();
-      ASSERT_EQ(sum_view_init_scalar,reference_sum);
+      ASSERT_EQ( sum_view_init_scalar, reference_sum );
+
       Scalar sum_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(sum_view_init_view,reference_sum);
+      ASSERT_EQ( sum_view_init_view, reference_sum );
     }
   }
 
-  static void test_prod(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_prod( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_prod = 1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%4+1);
-      reference_prod *= h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 4 + 1 );
+      reference_prod *= h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     ProdFunctor f;
     f.values = values;
     Scalar init = 1;
 
-    if(std::is_arithmetic<Scalar>::value)
+    if ( std::is_arithmetic< Scalar >::value )
     {
       Scalar prod_scalar = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_scalar(prod_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(prod_scalar,reference_prod);
+      Kokkos::Experimental::Prod< Scalar > reducer_scalar( prod_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( prod_scalar, reference_prod );
+
       Scalar prod_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(prod_scalar_view,reference_prod);
+      ASSERT_EQ( prod_scalar_view, reference_prod );
     }
+
     {
       Scalar prod_scalar_init = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_scalar_init(prod_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(prod_scalar_init,reference_prod);
+      Kokkos::Experimental::Prod< Scalar > reducer_scalar_init( prod_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( prod_scalar_init, reference_prod );
+
       Scalar prod_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(prod_scalar_init_view,reference_prod);
+      ASSERT_EQ( prod_scalar_init_view, reference_prod );
     }
 
-    if(std::is_arithmetic<Scalar>::value)
+    if ( std::is_arithmetic< Scalar >::value )
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > prod_view( "View" );
       prod_view() = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_view(prod_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Prod< Scalar > reducer_view( prod_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar prod_view_scalar = prod_view();
-      ASSERT_EQ(prod_view_scalar,reference_prod);
+      ASSERT_EQ( prod_view_scalar, reference_prod );
+
       Scalar prod_view_view = reducer_view.result_view()();
-      ASSERT_EQ(prod_view_view,reference_prod);
+      ASSERT_EQ( prod_view_view, reference_prod );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > prod_view_init( "View" );
       prod_view_init() = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_view_init(prod_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Prod< Scalar > reducer_view_init( prod_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar prod_view_init_scalar = prod_view_init();
-      ASSERT_EQ(prod_view_init_scalar,reference_prod);
+      ASSERT_EQ( prod_view_init_scalar, reference_prod );
+
       Scalar prod_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(prod_view_init_view,reference_prod);
+      ASSERT_EQ( prod_view_init_view, reference_prod );
     }
   }
 
-  static void test_min(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_min = std::numeric_limits<Scalar>::max();
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)<reference_min)
-        reference_min = h_values(i);
+  static void test_min( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) reference_min = h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MinFunctor f;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::max();
+    Scalar init = std::numeric_limits< Scalar >::max();
 
     {
       Scalar min_scalar = init;
-      Kokkos::Experimental::Min<Scalar> reducer_scalar(min_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(min_scalar,reference_min);
+      Kokkos::Experimental::Min< Scalar > reducer_scalar( min_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( min_scalar, reference_min );
+
       Scalar min_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(min_scalar_view,reference_min);
+      ASSERT_EQ( min_scalar_view, reference_min );
     }
+
     {
       Scalar min_scalar_init = init;
-      Kokkos::Experimental::Min<Scalar> reducer_scalar_init(min_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(min_scalar_init,reference_min);
+      Kokkos::Experimental::Min< Scalar > reducer_scalar_init( min_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( min_scalar_init, reference_min );
+
       Scalar min_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(min_scalar_init_view,reference_min);
+      ASSERT_EQ( min_scalar_init_view, reference_min );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> min_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > min_view( "View" );
       min_view() = init;
-      Kokkos::Experimental::Min<Scalar> reducer_view(min_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Min< Scalar > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar min_view_scalar = min_view();
-      ASSERT_EQ(min_view_scalar,reference_min);
+      ASSERT_EQ( min_view_scalar, reference_min );
+
       Scalar min_view_view = reducer_view.result_view()();
-      ASSERT_EQ(min_view_view,reference_min);
+      ASSERT_EQ( min_view_view, reference_min );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> min_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > min_view_init( "View" );
       min_view_init() = init;
-      Kokkos::Experimental::Min<Scalar> reducer_view_init(min_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Min< Scalar > reducer_view_init( min_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar min_view_init_scalar = min_view_init();
-      ASSERT_EQ(min_view_init_scalar,reference_min);
+      ASSERT_EQ( min_view_init_scalar, reference_min );
+
       Scalar min_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(min_view_init_view,reference_min);
+      ASSERT_EQ( min_view_init_view, reference_min );
     }
   }
 
-  static void test_max(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_max = std::numeric_limits<Scalar>::min();
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000+1);
-      if(h_values(i)>reference_max)
-        reference_max = h_values(i);
+  static void test_max( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+
+      if ( h_values( i ) > reference_max ) reference_max = h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MaxFunctor f;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::min();
+    Scalar init = std::numeric_limits< Scalar >::min();
 
     {
       Scalar max_scalar = init;
-      Kokkos::Experimental::Max<Scalar> reducer_scalar(max_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(max_scalar,reference_max);
+      Kokkos::Experimental::Max< Scalar > reducer_scalar( max_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( max_scalar, reference_max );
+
       Scalar max_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(max_scalar_view,reference_max);
+      ASSERT_EQ( max_scalar_view, reference_max );
     }
+
     {
       Scalar max_scalar_init = init;
-      Kokkos::Experimental::Max<Scalar> reducer_scalar_init(max_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(max_scalar_init,reference_max);
+      Kokkos::Experimental::Max< Scalar > reducer_scalar_init( max_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( max_scalar_init, reference_max );
+
       Scalar max_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(max_scalar_init_view,reference_max);
+      ASSERT_EQ( max_scalar_init_view, reference_max );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> max_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > max_view( "View" );
       max_view() = init;
-      Kokkos::Experimental::Max<Scalar> reducer_view(max_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Max< Scalar > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar max_view_scalar = max_view();
-      ASSERT_EQ(max_view_scalar,reference_max);
+      ASSERT_EQ( max_view_scalar, reference_max );
+
       Scalar max_view_view = reducer_view.result_view()();
-      ASSERT_EQ(max_view_view,reference_max);
+      ASSERT_EQ( max_view_view, reference_max );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> max_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > max_view_init( "View" );
       max_view_init() = init;
-      Kokkos::Experimental::Max<Scalar> reducer_view_init(max_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Max< Scalar > reducer_view_init( max_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar max_view_init_scalar = max_view_init();
-      ASSERT_EQ(max_view_init_scalar,reference_max);
+      ASSERT_EQ( max_view_init_scalar, reference_max );
+
       Scalar max_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(max_view_init_view,reference_max);
+      ASSERT_EQ( max_view_init_view, reference_max );
     }
   }
 
-  static void test_minloc(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_min = std::numeric_limits<Scalar>::max();
+  static void test_minloc( int N ) {
+    typedef typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
     int reference_loc = -1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)<reference_min) {
-        reference_min = h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) {
+        reference_min = h_values( i );
         reference_loc = i;
-      } else if (h_values(i) == reference_min) {
-        // make min unique
-        h_values(i) += std::numeric_limits<Scalar>::epsilon();
+      }
+      else if ( h_values( i ) == reference_min ) {
+        // Make min unique.
+        h_values( i ) += std::numeric_limits< Scalar >::epsilon();
       }
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MinLocFunctor f;
-    typedef typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type value_type;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::max();
-
+    Scalar init = std::numeric_limits< Scalar >::max();
 
     {
       value_type min_scalar;
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar(min_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(min_scalar.val,reference_min);
-      ASSERT_EQ(min_scalar.loc,reference_loc);
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar( min_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( min_scalar.val, reference_min );
+      ASSERT_EQ( min_scalar.loc, reference_loc );
+
       value_type min_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(min_scalar_view.val,reference_min);
-      ASSERT_EQ(min_scalar_view.loc,reference_loc);
+      ASSERT_EQ( min_scalar_view.val, reference_min );
+      ASSERT_EQ( min_scalar_view.loc, reference_loc );
     }
+
     {
       value_type min_scalar_init;
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar_init(min_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(min_scalar_init.val,reference_min);
-      ASSERT_EQ(min_scalar_init.loc,reference_loc);
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar_init( min_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( min_scalar_init.val, reference_min );
+      ASSERT_EQ( min_scalar_init.loc, reference_loc );
+
       value_type min_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(min_scalar_init_view.val,reference_min);
-      ASSERT_EQ(min_scalar_init_view.loc,reference_loc);
+      ASSERT_EQ( min_scalar_init_view.val, reference_min );
+      ASSERT_EQ( min_scalar_init_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> min_view("View");
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view(min_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::View< value_type, Kokkos::HostSpace > min_view( "View" );
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       value_type min_view_scalar = min_view();
-      ASSERT_EQ(min_view_scalar.val,reference_min);
-      ASSERT_EQ(min_view_scalar.loc,reference_loc);
+      ASSERT_EQ( min_view_scalar.val, reference_min );
+      ASSERT_EQ( min_view_scalar.loc, reference_loc );
+
       value_type min_view_view = reducer_view.result_view()();
-      ASSERT_EQ(min_view_view.val,reference_min);
-      ASSERT_EQ(min_view_view.loc,reference_loc);
+      ASSERT_EQ( min_view_view.val, reference_min );
+      ASSERT_EQ( min_view_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> min_view_init("View");
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view_init(min_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::View< value_type, Kokkos::HostSpace > min_view_init( "View" );
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_view_init( min_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       value_type min_view_init_scalar = min_view_init();
-      ASSERT_EQ(min_view_init_scalar.val,reference_min);
-      ASSERT_EQ(min_view_init_scalar.loc,reference_loc);
+      ASSERT_EQ( min_view_init_scalar.val, reference_min );
+      ASSERT_EQ( min_view_init_scalar.loc, reference_loc );
+
       value_type min_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(min_view_init_view.val,reference_min);
-      ASSERT_EQ(min_view_init_view.loc,reference_loc);
+      ASSERT_EQ( min_view_init_view.val, reference_min );
+      ASSERT_EQ( min_view_init_view.loc, reference_loc );
     }
   }
 
-  static void test_maxloc(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_max = std::numeric_limits<Scalar>::min();
+  static void test_maxloc( int N ) {
+    typedef typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
     int reference_loc = -1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)>reference_max) {
-        reference_max = h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) > reference_max ) {
+        reference_max = h_values( i );
         reference_loc = i;
-      } else if (h_values(i) == reference_max) {
-        // make max unique
-        h_values(i) -= std::numeric_limits<Scalar>::epsilon();
+      }
+      else if ( h_values( i ) == reference_max ) {
+        // Make max unique.
+        h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
       }
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MaxLocFunctor f;
-    typedef typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type value_type;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::min();
-
+    Scalar init = std::numeric_limits< Scalar >::min();
 
     {
       value_type max_scalar;
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar(max_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(max_scalar.val,reference_max);
-      ASSERT_EQ(max_scalar.loc,reference_loc);
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar( max_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( max_scalar.val, reference_max );
+      ASSERT_EQ( max_scalar.loc, reference_loc );
+
       value_type max_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(max_scalar_view.val,reference_max);
-      ASSERT_EQ(max_scalar_view.loc,reference_loc);
+      ASSERT_EQ( max_scalar_view.val, reference_max );
+      ASSERT_EQ( max_scalar_view.loc, reference_loc );
     }
+
     {
       value_type max_scalar_init;
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar_init(max_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(max_scalar_init.val,reference_max);
-      ASSERT_EQ(max_scalar_init.loc,reference_loc);
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar_init( max_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( max_scalar_init.val, reference_max );
+      ASSERT_EQ( max_scalar_init.loc, reference_loc );
+
       value_type max_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(max_scalar_init_view.val,reference_max);
-      ASSERT_EQ(max_scalar_init_view.loc,reference_loc);
+      ASSERT_EQ( max_scalar_init_view.val, reference_max );
+      ASSERT_EQ( max_scalar_init_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> max_view("View");
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view(max_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::View< value_type, Kokkos::HostSpace > max_view( "View" );
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       value_type max_view_scalar = max_view();
-      ASSERT_EQ(max_view_scalar.val,reference_max);
-      ASSERT_EQ(max_view_scalar.loc,reference_loc);
+      ASSERT_EQ( max_view_scalar.val, reference_max );
+      ASSERT_EQ( max_view_scalar.loc, reference_loc );
+
       value_type max_view_view = reducer_view.result_view()();
-      ASSERT_EQ(max_view_view.val,reference_max);
-      ASSERT_EQ(max_view_view.loc,reference_loc);
+      ASSERT_EQ( max_view_view.val, reference_max );
+      ASSERT_EQ( max_view_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> max_view_init("View");
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view_init(max_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::View< value_type, Kokkos::HostSpace > max_view_init( "View" );
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_view_init( max_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       value_type max_view_init_scalar = max_view_init();
-      ASSERT_EQ(max_view_init_scalar.val,reference_max);
-      ASSERT_EQ(max_view_init_scalar.loc,reference_loc);
+      ASSERT_EQ( max_view_init_scalar.val, reference_max );
+      ASSERT_EQ( max_view_init_scalar.loc, reference_loc );
+
       value_type max_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(max_view_init_view.val,reference_max);
-      ASSERT_EQ(max_view_init_view.loc,reference_loc);
+      ASSERT_EQ( max_view_init_view.val, reference_max );
+      ASSERT_EQ( max_view_init_view.loc, reference_loc );
     }
   }
 
-  static void test_minmaxloc(int N) {
-     Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-     auto h_values = Kokkos::create_mirror_view(values);
-     Scalar reference_max = std::numeric_limits<Scalar>::min();
-     Scalar reference_min = std::numeric_limits<Scalar>::max();
+  static void test_minmaxloc( int N ) {
+     typedef typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type value_type;
+
+     Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+     auto h_values = Kokkos::create_mirror_view( values );
+     Scalar reference_max = std::numeric_limits< Scalar >::min();
+     Scalar reference_min = std::numeric_limits< Scalar >::max();
      int reference_minloc = -1;
      int reference_maxloc = -1;
-     for(int i=0; i<N; i++) {
-       h_values(i) = (Scalar)(rand()%100000);
+
+     for ( int i = 0; i < N; i++ ) {
+       h_values( i ) = (Scalar) ( rand() % 100000 );
      }
-     for(int i=0; i<N; i++) {
-       if(h_values(i)>reference_max) {
-         reference_max = h_values(i);
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) > reference_max ) {
+         reference_max = h_values( i );
          reference_maxloc = i;
-       } else if (h_values(i) == reference_max) {
-         // make max unique
-         h_values(i) -= std::numeric_limits<Scalar>::epsilon();
+       }
+       else if ( h_values( i ) == reference_max ) {
+         // Make max unique.
+         h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
        }
      }
-     for(int i=0; i<N; i++) {
-       if(h_values(i)<reference_min) {
-         reference_min = h_values(i);
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) < reference_min ) {
+         reference_min = h_values( i );
          reference_minloc = i;
-       } else if (h_values(i) == reference_min) {
-         // make min unique
-         h_values(i) += std::numeric_limits<Scalar>::epsilon();
+       }
+       else if ( h_values( i ) == reference_min ) {
+         // Make min unique.
+         h_values( i ) += std::numeric_limits< Scalar >::epsilon();
        }
      }
-     Kokkos::deep_copy(values,h_values);
+
+     Kokkos::deep_copy( values, h_values );
 
      MinMaxLocFunctor f;
-     typedef typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type value_type;
      f.values = values;
-     Scalar init_min = std::numeric_limits<Scalar>::max();
-     Scalar init_max = std::numeric_limits<Scalar>::min();
-
+     Scalar init_min = std::numeric_limits< Scalar >::max();
+     Scalar init_max = std::numeric_limits< Scalar >::min();
 
      {
        value_type minmax_scalar;
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar(minmax_scalar);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-       ASSERT_EQ(minmax_scalar.min_val,reference_min);
-       for(int i=0; i<N; i++) {
-         if((i == minmax_scalar.min_loc) && (h_values(i)==reference_min))
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar( minmax_scalar );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+       ASSERT_EQ( minmax_scalar.min_val, reference_min );
+
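+       // Added comment (sketching the apparent intent): epsilon() is zero for integer Scalar types, so duplicate extrema can survive; accept whichever matching location the reducer reports. The same re-check runs for the maximum below.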
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.min_loc ) && ( h_values( i ) == reference_min ) ) {
            reference_minloc = i;
+         }
        }
-       ASSERT_EQ(minmax_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar.max_val,reference_max);
-       for(int i=0; i<N; i++) {
-         if((i == minmax_scalar.max_loc) && (h_values(i)==reference_max))
+
+       ASSERT_EQ( minmax_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar.max_val, reference_max );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.max_loc ) && ( h_values( i ) == reference_max ) ) {
            reference_maxloc = i;
+         }
        }
-       ASSERT_EQ(minmax_scalar.max_loc,reference_maxloc);
+
+       ASSERT_EQ( minmax_scalar.max_loc, reference_maxloc );
+
        value_type minmax_scalar_view = reducer_scalar.result_view()();
-       ASSERT_EQ(minmax_scalar_view.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_view.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_scalar_view.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_view.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_view.max_loc, reference_maxloc );
      }
+
      {
        value_type minmax_scalar_init;
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar_init(minmax_scalar_init,init_min,init_max);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-       ASSERT_EQ(minmax_scalar_init.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_init.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_init.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_init.max_loc,reference_maxloc);
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar_init( minmax_scalar_init, init_min, init_max );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+       ASSERT_EQ( minmax_scalar_init.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_init.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_init.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_init.max_loc, reference_maxloc );
+
        value_type minmax_scalar_init_view = reducer_scalar_init.result_view()();
-       ASSERT_EQ(minmax_scalar_init_view.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_init_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_init_view.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_init_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_scalar_init_view.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_init_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_init_view.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_init_view.max_loc, reference_maxloc );
      }
+
      {
-       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view("View");
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view(minmax_view);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+       Kokkos::View< value_type, Kokkos::HostSpace > minmax_view( "View" );
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_view( minmax_view );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
        value_type minmax_view_scalar = minmax_view();
-       ASSERT_EQ(minmax_view_scalar.min_val,reference_min);
-       ASSERT_EQ(minmax_view_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_scalar.max_val,reference_max);
-       ASSERT_EQ(minmax_view_scalar.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_scalar.min_val, reference_min );
+       ASSERT_EQ( minmax_view_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_scalar.max_val, reference_max );
+       ASSERT_EQ( minmax_view_scalar.max_loc, reference_maxloc );
+
        value_type minmax_view_view = reducer_view.result_view()();
-       ASSERT_EQ(minmax_view_view.min_val,reference_min);
-       ASSERT_EQ(minmax_view_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_view.max_val,reference_max);
-       ASSERT_EQ(minmax_view_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_view.min_val, reference_min );
+       ASSERT_EQ( minmax_view_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_view.max_val, reference_max );
+       ASSERT_EQ( minmax_view_view.max_loc, reference_maxloc );
      }
+
      {
-       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view_init("View");
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view_init(minmax_view_init,init_min,init_max);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+       Kokkos::View< value_type, Kokkos::HostSpace > minmax_view_init( "View" );
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_view_init( minmax_view_init, init_min, init_max );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
        value_type minmax_view_init_scalar = minmax_view_init();
-       ASSERT_EQ(minmax_view_init_scalar.min_val,reference_min);
-       ASSERT_EQ(minmax_view_init_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_init_scalar.max_val,reference_max);
-       ASSERT_EQ(minmax_view_init_scalar.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_init_scalar.min_val, reference_min );
+       ASSERT_EQ( minmax_view_init_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_init_scalar.max_val, reference_max );
+       ASSERT_EQ( minmax_view_init_scalar.max_loc, reference_maxloc );
+
        value_type minmax_view_init_view = reducer_view_init.result_view()();
-       ASSERT_EQ(minmax_view_init_view.min_val,reference_min);
-       ASSERT_EQ(minmax_view_init_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_init_view.max_val,reference_max);
-       ASSERT_EQ(minmax_view_init_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_init_view.min_val, reference_min );
+       ASSERT_EQ( minmax_view_init_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_init_view.max_val, reference_max );
+       ASSERT_EQ( minmax_view_init_view.max_loc, reference_maxloc );
      }
    }
 
-  static void test_BAnd(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_band = Scalar() | (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000+1);
-      reference_band = reference_band & h_values(i);
+  static void test_BAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
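+    // Scalar() | ~Scalar() sets every bit: the identity element for bitwise AND.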
+    Scalar reference_band = Scalar() | ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+      reference_band = reference_band & h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BAndFunctor f;
     f.values = values;
-    Scalar init = Scalar() | (~Scalar());
+    Scalar init = Scalar() | ( ~Scalar() );
 
     {
       Scalar band_scalar = init;
-      Kokkos::Experimental::BAnd<Scalar> reducer_scalar(band_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(band_scalar,reference_band);
+      Kokkos::Experimental::BAnd< Scalar > reducer_scalar( band_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( band_scalar, reference_band );
+
       Scalar band_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(band_scalar_view,reference_band);
+      ASSERT_EQ( band_scalar_view, reference_band );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> band_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > band_view( "View" );
       band_view() = init;
-      Kokkos::Experimental::BAnd<Scalar> reducer_view(band_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BAnd< Scalar > reducer_view( band_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar band_view_scalar = band_view();
-      ASSERT_EQ(band_view_scalar,reference_band);
+      ASSERT_EQ( band_view_scalar, reference_band );
+
       Scalar band_view_view = reducer_view.result_view()();
-      ASSERT_EQ(band_view_view,reference_band);
+      ASSERT_EQ( band_view_view, reference_band );
     }
   }
 
-  static void test_BOr(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_bor = Scalar() & (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)((rand()%100000+1)*2);
-      reference_bor = reference_bor | h_values(i);
+  static void test_BOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
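+    // Scalar() & ~Scalar() is zero: the identity element for bitwise OR.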
+    Scalar reference_bor = Scalar() & ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( ( rand() % 100000 + 1 ) * 2 );
+      reference_bor = reference_bor | h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BOrFunctor f;
     f.values = values;
-    Scalar init = Scalar() & (~Scalar());
+    Scalar init = Scalar() & ( ~Scalar() );
 
     {
       Scalar bor_scalar = init;
-      Kokkos::Experimental::BOr<Scalar> reducer_scalar(bor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(bor_scalar,reference_bor);
+      Kokkos::Experimental::BOr< Scalar > reducer_scalar( bor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( bor_scalar, reference_bor );
+
       Scalar bor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(bor_scalar_view,reference_bor);
+      ASSERT_EQ( bor_scalar_view, reference_bor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> bor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > bor_view( "View" );
       bor_view() = init;
-      Kokkos::Experimental::BOr<Scalar> reducer_view(bor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BOr< Scalar > reducer_view( bor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar bor_view_scalar = bor_view();
-      ASSERT_EQ(bor_view_scalar,reference_bor);
+      ASSERT_EQ( bor_view_scalar, reference_bor );
+
       Scalar bor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(bor_view_view,reference_bor);
+      ASSERT_EQ( bor_view_view, reference_bor );
     }
   }
 
-  static void test_BXor(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_bxor = Scalar() & (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)((rand()%100000+1)*2);
-      reference_bxor = reference_bxor ^ h_values(i);
+  static void test_BXor( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
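+    // Scalar() & ~Scalar() is zero, which is also the identity element for bitwise XOR.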
+    Scalar reference_bxor = Scalar() & ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( ( rand() % 100000 + 1 ) * 2 );
+      reference_bxor = reference_bxor ^ h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BXorFunctor f;
     f.values = values;
-    Scalar init = Scalar() & (~Scalar());
+    Scalar init = Scalar() & ( ~Scalar() );
 
     {
       Scalar bxor_scalar = init;
-      Kokkos::Experimental::BXor<Scalar> reducer_scalar(bxor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(bxor_scalar,reference_bxor);
+      Kokkos::Experimental::BXor< Scalar > reducer_scalar( bxor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( bxor_scalar, reference_bxor );
+
       Scalar bxor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(bxor_scalar_view,reference_bxor);
+      ASSERT_EQ( bxor_scalar_view, reference_bxor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> bxor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > bxor_view( "View" );
       bxor_view() = init;
-      Kokkos::Experimental::BXor<Scalar> reducer_view(bxor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BXor< Scalar > reducer_view( bxor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar bxor_view_scalar = bxor_view();
-      ASSERT_EQ(bxor_view_scalar,reference_bxor);
+      ASSERT_EQ( bxor_view_scalar, reference_bxor );
+
       Scalar bxor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(bxor_view_view,reference_bxor);
+      ASSERT_EQ( bxor_view_view, reference_bxor );
     }
   }
 
-  static void test_LAnd(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_land = 1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_land = reference_land && h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_land = reference_land && h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LAndFunctor f;
     f.values = values;
@@ -1781,34 +1924,39 @@ struct TestReducers {
 
     {
       Scalar land_scalar = init;
-      Kokkos::Experimental::LAnd<Scalar> reducer_scalar(land_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(land_scalar,reference_land);
+      Kokkos::Experimental::LAnd< Scalar > reducer_scalar( land_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( land_scalar, reference_land );
+
       Scalar land_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(land_scalar_view,reference_land);
+      ASSERT_EQ( land_scalar_view, reference_land );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> land_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > land_view( "View" );
       land_view() = init;
-      Kokkos::Experimental::LAnd<Scalar> reducer_view(land_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LAnd< Scalar > reducer_view( land_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar land_view_scalar = land_view();
-      ASSERT_EQ(land_view_scalar,reference_land);
+      ASSERT_EQ( land_view_scalar, reference_land );
+
       Scalar land_view_view = reducer_view.result_view()();
-      ASSERT_EQ(land_view_view,reference_land);
+      ASSERT_EQ( land_view_view, reference_land );
     }
   }
 
-  static void test_LOr(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_lor = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_lor = reference_lor || h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_lor = reference_lor || h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LOrFunctor f;
     f.values = values;
@@ -1816,34 +1964,39 @@ struct TestReducers {
 
     {
       Scalar lor_scalar = init;
-      Kokkos::Experimental::LOr<Scalar> reducer_scalar(lor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(lor_scalar,reference_lor);
+      Kokkos::Experimental::LOr< Scalar > reducer_scalar( lor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( lor_scalar, reference_lor );
+
       Scalar lor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(lor_scalar_view,reference_lor);
+      ASSERT_EQ( lor_scalar_view, reference_lor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> lor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > lor_view( "View" );
       lor_view() = init;
-      Kokkos::Experimental::LOr<Scalar> reducer_view(lor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LOr< Scalar > reducer_view( lor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar lor_view_scalar = lor_view();
-      ASSERT_EQ(lor_view_scalar,reference_lor);
+      ASSERT_EQ( lor_view_scalar, reference_lor );
+
       Scalar lor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(lor_view_view,reference_lor);
+      ASSERT_EQ( lor_view_view, reference_lor );
     }
   }
 
-  static void test_LXor(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LXor( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_lxor = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_lxor = reference_lxor ? (!h_values(i)) : h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_lxor = reference_lxor ? ( !h_values( i ) ) : h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LXorFunctor f;
     f.values = values;
@@ -1851,57 +2004,59 @@ struct TestReducers {
 
     {
       Scalar lxor_scalar = init;
-      Kokkos::Experimental::LXor<Scalar> reducer_scalar(lxor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(lxor_scalar,reference_lxor);
+      Kokkos::Experimental::LXor< Scalar > reducer_scalar( lxor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( lxor_scalar, reference_lxor );
+
       Scalar lxor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(lxor_scalar_view,reference_lxor);
+      ASSERT_EQ( lxor_scalar_view, reference_lxor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> lxor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > lxor_view( "View" );
       lxor_view() = init;
-      Kokkos::Experimental::LXor<Scalar> reducer_view(lxor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LXor< Scalar > reducer_view( lxor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar lxor_view_scalar = lxor_view();
-      ASSERT_EQ(lxor_view_scalar,reference_lxor);
+      ASSERT_EQ( lxor_view_scalar, reference_lxor );
+
       Scalar lxor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(lxor_view_view,reference_lxor);
+      ASSERT_EQ( lxor_view_view, reference_lxor );
     }
   }
 
   static void execute_float() {
-    test_sum(10001);
-    test_prod(35);
-    test_min(10003);
-    test_minloc(10003);
-    test_max(10007);
-    test_maxloc(10007);
-    test_minmaxloc(10007);
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
   }
 
   static void execute_integer() {
-    test_sum(10001);
-    test_prod(35);
-    test_min(10003);
-    test_minloc(10003);
-    test_max(10007);
-    test_maxloc(10007);
-    test_minmaxloc(10007);
-    test_BAnd(35);
-    test_BOr(35);
-    test_BXor(35);
-    test_LAnd(35);
-    test_LOr(35);
-    test_LXor(35);
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
+    test_BAnd( 35 );
+    test_BOr( 35 );
+    test_BXor( 35 );
+    test_LAnd( 35 );
+    test_LOr( 35 );
+    test_LXor( 35 );
   }
 
   static void execute_basic() {
-    test_sum(10001);
-    test_prod(35);
+    test_sum( 10001 );
+    test_prod( 35 );
   }
 };
-}
-
-/*--------------------------------------------------------------------------*/
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp
index 1a9811a854f85e2b7ef918ff2d1e36b268ae6c28..547e03497601a0a7da8bc3d0027ee9fef603e196 100644
--- a/lib/kokkos/core/unit_test/TestScan.hpp
+++ b/lib/kokkos/core/unit_test/TestScan.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,82 +36,81 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-/*--------------------------------------------------------------------------*/
-
 #include <stdio.h>
 
 namespace Test {
 
-template< class Device , class WorkSpec = size_t >
+template< class Device, class WorkSpec = size_t >
 struct TestScan {
+  typedef  Device    execution_space;
+  typedef  long int  value_type;
 
-  typedef  Device    execution_space ;
-  typedef  long int  value_type ;
-
-  Kokkos::View<int,Device,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View< int, Device, Kokkos::MemoryTraits< Kokkos::Atomic > > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int iwork , value_type & update , const bool final_pass ) const
+  void operator()( const int iwork, value_type & update, const bool final_pass ) const
   {
-    const value_type n = iwork + 1 ;
-    const value_type imbalance = ( (1000 <= n) && (0 == n % 1000) ) ? 1000 : 0 ;
+    const value_type n = iwork + 1;
+    const value_type imbalance = ( ( 1000 <= n ) && ( 0 == n % 1000 ) ) ? 1000 : 0;
 
     // Insert an artificial load imbalance
 
-    for ( value_type i = 0 ; i < imbalance ; ++i ) { ++update ; }
+    for ( value_type i = 0; i < imbalance; ++i ) { ++update; }
 
-    update += n - imbalance ;
+    update += n - imbalance;
 
     if ( final_pass ) {
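+      // Added comment: the expected inclusive-scan value is the triangular number 1 + 2 + ... + n, with the even factor divided first so the integer division is exact.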
       const value_type answer = n & 1 ? ( n * ( ( n + 1 ) / 2 ) ) : ( ( n / 2 ) * ( n + 1 ) );
 
       if ( answer != update ) {
         errors()++;
-        if(errors()<20)
-          printf("TestScan(%d,%ld) != %ld\n",iwork,update,answer);
+
+        if ( errors() < 20 ) {
+          printf( "TestScan(%d,%ld) != %ld\n", iwork, update, answer );
+        }
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const { update = 0 ; }
+  void init( value_type & update ) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
+  void join( volatile       value_type & update,
              volatile const value_type & input ) const
-  { update += input ; }
+  { update += input; }
 
   TestScan( const WorkSpec & N )
-    {
-      Kokkos::View<int,Device > errors_a("Errors");
-      Kokkos::deep_copy(errors_a,0);
-      errors = errors_a;
-      parallel_scan( N , *this );
-    }
+  {
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
+
+    parallel_scan( N, *this );
+  }
 
   TestScan( const WorkSpec & Start , const WorkSpec & N )
-    {
-      typedef Kokkos::RangePolicy<execution_space> exec_policy ;
+  {
+    typedef Kokkos::RangePolicy< execution_space > exec_policy;
 
-      Kokkos::View<int,Device > errors_a("Errors");
-      Kokkos::deep_copy(errors_a,0);
-      errors = errors_a;
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
 
-      parallel_scan( exec_policy( Start , N ) , *this );
-    }
+    parallel_scan( exec_policy( Start, N ), *this );
+  }
 
-  static void test_range( const WorkSpec & begin , const WorkSpec & end )
-    {
-      for ( WorkSpec i = begin ; i < end ; ++i ) {
-        (void) TestScan( i );
-      }
+  static void test_range( const WorkSpec & begin, const WorkSpec & end )
+  {
+    for ( WorkSpec i = begin; i < end; ++i ) {
+      (void) TestScan( i );
     }
+  }
 };
 
-}
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
index 291f9f60e4b8050e11b653f3f3ae975f1d1e8c91..6eca6bb38db08d562672d39b32eb22663da9f5b2 100644
--- a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
+++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -54,162 +54,157 @@
 namespace Test {
 
 struct SharedAllocDestroy {
+  volatile int * count;
 
-  volatile int * count ;
-
-  SharedAllocDestroy() = default ;
+  SharedAllocDestroy() = default;
   SharedAllocDestroy( int * arg ) : count( arg ) {}
 
   void destroy_shared_allocation()
-    {
-      Kokkos::atomic_increment( count );
-    }
-
+  {
+    Kokkos::atomic_increment( count );
+  }
 };
 
-template< class MemorySpace , class ExecutionSpace >
+template< class MemorySpace, class ExecutionSpace >
 void test_shared_alloc()
 {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  typedef const Kokkos::Impl::SharedAllocationHeader                               Header;
+  typedef Kokkos::Impl::SharedAllocationTracker                                    Tracker;
+  typedef Kokkos::Impl::SharedAllocationRecord< void, void >                       RecordBase;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, void >                RecordMemS;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, SharedAllocDestroy >  RecordFull;
 
-  typedef const Kokkos::Impl::SharedAllocationHeader   Header ;
-  typedef Kokkos::Impl::SharedAllocationTracker  Tracker ;
-  typedef Kokkos::Impl::SharedAllocationRecord< void , void >                       RecordBase ;
-  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace , void >                RecordMemS ;
-  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace , SharedAllocDestroy >  RecordFull ;
-
-  static_assert( sizeof(Tracker) == sizeof(int*), "SharedAllocationTracker has wrong size!" );
+  static_assert( sizeof( Tracker ) == sizeof( int* ), "SharedAllocationTracker has wrong size!" );
 
-  MemorySpace s ;
+  MemorySpace s;
 
-  const size_t N = 1200 ;
-  const size_t size = 8 ;
+  const size_t N = 1200;
+  const size_t size = 8;
 
   RecordMemS * rarray[ N ];
   Header     * harray[ N ];
 
-  RecordMemS ** const r = rarray ;
-  Header     ** const h = harray ;
+  RecordMemS ** const r = rarray;
+  Header     ** const h = harray;
+
+  Kokkos::RangePolicy< ExecutionSpace > range( 0, N );
 
-  Kokkos::RangePolicy< ExecutionSpace > range(0,N);
-  
-  //----------------------------------------
   {
-  // Since always executed on host space, leave [=]
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      char name[64] ;
-      sprintf(name,"test_%.2d",int(i));
+    // Since this is always executed on host space, leave [=].
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
 
-      r[i] = RecordMemS::allocate( s , name , size * ( i + 1 ) );
+      r[i] = RecordMemS::allocate( s, name, size * ( i + 1 ) );
       h[i] = Header::get_header( r[i]->data() );
 
-      ASSERT_EQ( r[i]->use_count() , 0 );
+      ASSERT_EQ( r[i]->use_count(), 0 );
 
-      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
 
-      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
-      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
     });
 
     // Sanity check for the whole set of allocation records to which this record belongs.
     RecordBase::is_sane( r[0] );
-    // RecordMemS::print_records( std::cout , s , true );
+    // RecordMemS::print_records( std::cout, s, true );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
         if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
       }
     });
   }
-  //----------------------------------------
+
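+  // Second pass: same allocate/release cycle, but each record carries a destroy functor that counts its invocations.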
   {
-    int destroy_count = 0 ;
-    SharedAllocDestroy counter( & destroy_count );
+    int destroy_count = 0;
+    SharedAllocDestroy counter( &destroy_count );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      char name[64] ;
-      sprintf(name,"test_%.2d",int(i));
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
 
-      RecordFull * rec = RecordFull::allocate( s , name , size * ( i + 1 ) );
+      RecordFull * rec = RecordFull::allocate( s, name, size * ( i + 1 ) );
 
-      rec->m_destroy = counter ;
+      rec->m_destroy = counter;
 
-      r[i] = rec ;
+      r[i] = rec;
       h[i] = Header::get_header( r[i]->data() );
 
-      ASSERT_EQ( r[i]->use_count() , 0 );
+      ASSERT_EQ( r[i]->use_count(), 0 );
 
-      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
 
-      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
-      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
     });
 
     RecordBase::is_sane( r[0] );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
         if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
       }
     });
 
-    ASSERT_EQ( destroy_count , int(N) );
+    ASSERT_EQ( destroy_count, int( N ) );
   }
 
-  //----------------------------------------
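+  // Third pass: exercise SharedAllocationTracker reference counting on a single record.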
   {
-    int destroy_count = 0 ;
+    int destroy_count = 0;
 
     {
-      RecordFull * rec = RecordFull::allocate( s , "test" , size );
+      RecordFull * rec = RecordFull::allocate( s, "test", size );
 
-      // ... Construction of the allocated { rec->data() , rec->size() }
+      // ... Construction of the allocated { rec->data(), rec->size() }
 
-      // Copy destruction function object into the allocation record
+      // Copy destruction function object into the allocation record.
       rec->m_destroy = SharedAllocDestroy( & destroy_count );
 
-      ASSERT_EQ( rec->use_count() , 0 );
+      ASSERT_EQ( rec->use_count(), 0 );
 
-      // Start tracking, increments the use count from 0 to 1
-      Tracker track ;
+      // Start tracking; this increments the use count from 0 to 1.
+      Tracker track;
 
       track.assign_allocated_record_to_uninitialized( rec );
 
-      ASSERT_EQ( rec->use_count() , 1 );
-      ASSERT_EQ( track.use_count() , 1 );
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
+
+      // Verify that constructing and destroying a local tracker increments and decrements the use count.
+      for ( size_t i = 0; i < N; ++i ) {
+        ASSERT_EQ( rec->use_count(), 1 );
 
-      // Verify construction / destruction increment
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        ASSERT_EQ( rec->use_count() , 1 );
         {
-          Tracker local_tracker ;
+          Tracker local_tracker;
           local_tracker.assign_allocated_record_to_uninitialized( rec );
-          ASSERT_EQ( rec->use_count() , 2 );
-          ASSERT_EQ( local_tracker.use_count() , 2 );
+          ASSERT_EQ( rec->use_count(), 2 );
+          ASSERT_EQ( local_tracker.use_count(), 2 );
         }
-        ASSERT_EQ( rec->use_count() , 1 );
-        ASSERT_EQ( track.use_count() , 1 );
+
+        ASSERT_EQ( rec->use_count(), 1 );
+        ASSERT_EQ( track.use_count(), 1 );
       }
 
-      Kokkos::parallel_for( range , [=]( size_t i ){
-        Tracker local_tracker ;
+      Kokkos::parallel_for( range, [=] ( size_t i ) {
+        Tracker local_tracker;
         local_tracker.assign_allocated_record_to_uninitialized( rec );
-        ASSERT_GT( rec->use_count() , 1 );
+        ASSERT_GT( rec->use_count(), 1 );
       });
 
-      ASSERT_EQ( rec->use_count() , 1 );
-      ASSERT_EQ( track.use_count() , 1 );
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
 
       // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object.
     }
 
-    ASSERT_EQ( destroy_count , 1 );
+    ASSERT_EQ( destroy_count, 1 );
   }
 
 #endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
 
 }
 
-
-}
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.cpp b/lib/kokkos/core/unit_test/TestSynchronic.cpp
deleted file mode 100644
index dc1abbd8b3d6a0532408956a5a7bffff1ec2f3f6..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestSynchronic.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-//#undef _WIN32_WINNT
-//#define _WIN32_WINNT 0x0602
-
-#if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || \
-	defined(__APPLE__) || defined(__ARM_ARCH_8A) || defined(_CRAYC)
-
-// Skip for now
-
-#else
-
-#include <gtest/gtest.h>
-
-#ifdef USEOMP
-#include <omp.h>
-#endif
-
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include <map>
-#include <cstring>
-#include <ctime>
-
-//#include <details/config>
-//#undef __SYNCHRONIC_COMPATIBLE
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <impl/Kokkos_Synchronic_n3998.hpp>
-
-#include "TestSynchronic.hpp"
-
-// Uncomment to allow test to dump output
-//#define VERBOSE_TEST
-
-namespace Test {
-
-unsigned next_table[] =
-    {
-        0, 1, 2, 3,         //0-3
-        4, 4, 6, 6,         //4-7
-        8, 8, 8, 8,         //8-11
-        12, 12, 12, 12,     //12-15
-        16, 16, 16, 16,     //16-19
-        16, 16, 16, 16,     //20-23
-        24, 24, 24, 24,     //24-27
-        24, 24, 24, 24,     //28-31
-        32, 32, 32, 32,     //32-35
-        32, 32, 32, 32,     //36-39
-        40, 40, 40, 40,     //40-43
-        40, 40, 40, 40,     //44-47
-        48, 48, 48, 48,     //48-51
-        48, 48, 48, 48,     //52-55
-        56, 56, 56, 56,     //56-59
-        56, 56, 56, 56,     //60-63
-    };
-
-//change this if you want to allow oversubscription of the system, by default only the range {1-(system size)} is tested
-#define FOR_GAUNTLET(x) for(unsigned x = (std::min)(std::thread::hardware_concurrency()*8,unsigned(sizeof(next_table)/sizeof(unsigned))); x; x = next_table[x-1])
-
-//set this to override the benchmark of barriers to use OMP barriers instead of n3998 std::barrier
-//#define USEOMP
-
-#if defined(__SYNCHRONIC_COMPATIBLE)
-    #define PREFIX "futex-"
-#else
-    #define PREFIX "backoff-"
-#endif
-
-//this test uses a custom Mersenne twister to eliminate implementation variation
-MersenneTwister mt;
-
-int dummya = 1, dummyb =1;
-
-int dummy1 = 1;
-std::atomic<int> dummy2(1);
-std::atomic<int> dummy3(1);
-
-double time_item(int const count = (int)1E8)  {
-
-    clock_t const start = clock();
-
-    for(int i = 0;i < count; ++i)
-        mt.integer();
-
-    clock_t const end = clock();
-    double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC);
-
-    return elapsed_seconds / count;
-}
-double time_nil(int const count = (int)1E08)  {
-
-    clock_t const start = clock();
-
-    dummy3 = count;
-    for(int i = 0;i < (int)1E6; ++i) {
-        if(dummy1) {
-            // Do some work while holding the lock
-            int workunits = dummy3;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f);
-            for (int j = 1; j < workunits; j++)
-                dummy1 &= j;       // Do one work unit
-            dummy2.fetch_add(dummy1,std::memory_order_relaxed);
-        }
-    }
-
-    clock_t const end = clock();
-    double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC);
-
-    return elapsed_seconds / count;
-}
-
-
-template <class mutex_type>
-void testmutex_inner(mutex_type& m, std::atomic<int>& t,std::atomic<int>& wc,std::atomic<int>& wnc, int const num_iterations,
-                     int const num_items_critical, int const num_items_noncritical, MersenneTwister& mtc, MersenneTwister& mtnc, bool skip) {
-
-    for(int k = 0; k < num_iterations; ++k) {
-
-        if(num_items_noncritical) {
-            // Do some work without holding the lock
-            int workunits = num_items_noncritical;//(int) (mtnc.poissonInterval((float)num_items_noncritical) + 0.5f);
-            for (int i = 1; i < workunits; i++)
-                mtnc.integer();       // Do one work unit
-            wnc.fetch_add(workunits,std::memory_order_relaxed);
-        }
-
-        t.fetch_add(1,std::memory_order_relaxed);
-
-        if(!skip) {
-            std::unique_lock<mutex_type> l(m);
-            if(num_items_critical) {
-                // Do some work while holding the lock
-                int workunits = num_items_critical;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f);
-                for (int i = 1; i < workunits; i++)
-                    mtc.integer();       // Do one work unit
-                wc.fetch_add(workunits,std::memory_order_relaxed);
-            }
-        }
-    }
-}
-template <class mutex_type>
-void testmutex_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double critical_fraction, double critical_duration) {
-
-    std::ostringstream truename;
-    truename << name << " (f=" << critical_fraction << ",d=" << critical_duration << ")";
-
-    std::vector<double>& data = results[truename.str()];
-
-    double const workItemTime = time_item() ,
-                 nilTime = time_nil();
-
-    int const num_items_critical = (critical_duration <= 0 ? 0 : (std::max)( int(critical_duration / workItemTime + 0.5), int(100 * nilTime / workItemTime + 0.5))),
-              num_items_noncritical = (num_items_critical <= 0 ? 0 : int( ( 1 - critical_fraction ) * num_items_critical / critical_fraction + 0.5 ));
-
-    FOR_GAUNTLET(num_threads) {
-
-        //Kokkos::Impl::portable_sleep(std::chrono::microseconds(2000000));
-
-        int const num_iterations = (num_items_critical + num_items_noncritical != 0) ?
-#ifdef __SYNCHRONIC_JUST_YIELD
-                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
-#else
-                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
-#endif
-#ifdef WIN32
-                                        int( 1 / workItemTime / (20 * num_threads * num_threads) );
-#else
-                                        int( 1 / workItemTime / (200 * num_threads * num_threads) );
-#endif
-
-#ifdef VERBOSE_TEST
-        std::cerr << "running " << truename.str() << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\n" << std::flush;
-#endif
-
-
-        std::atomic<int> t[2], wc[2], wnc[2];
-
-        clock_t start[2], end[2];
-        for(int pass = 0; pass < 2; ++pass) {
-
-            t[pass] = 0;
-            wc[pass] = 0;
-            wnc[pass] = 0;
-
-            srand(num_threads);
-            std::vector<MersenneTwister> randomsnc(num_threads),
-                                         randomsc(num_threads);
-
-            mutex_type m;
-
-            start[pass] = clock();
-#ifdef USEOMP
-            omp_set_num_threads(num_threads);
-            std::atomic<int> _j(0);
-            #pragma omp parallel
-            {
-                int const j = _j.fetch_add(1,std::memory_order_relaxed);
-                testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
-                num_threads = omp_get_num_threads();
-            }
-#else
-            std::vector<std::thread*> threads(num_threads);
-            for(unsigned j = 0; j < num_threads; ++j)
-                threads[j] = new std::thread([&,j](){
-                        testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
-                    }
-                );
-            for(unsigned j = 0; j < num_threads; ++j) {
-                threads[j]->join();
-                delete threads[j];
-            }
-#endif
-            end[pass] = clock();
-        }
-        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
-        if(wnc[0] != wnc[1]) throw std::string("mismatched work item counts");
-
-        double elapsed_seconds_0 = (end[0] - start[0]) / double(CLOCKS_PER_SEC),
-               elapsed_seconds_1 = (end[1] - start[1]) / double(CLOCKS_PER_SEC);
-        double time = (elapsed_seconds_1 - elapsed_seconds_0 - wc[1]*workItemTime) / num_iterations;
-
-        data.push_back(time);
-#ifdef VERBOSE_TEST
-        std::cerr << truename.str() << " : " << num_threads << "," << elapsed_seconds_1 / num_iterations << " - " << elapsed_seconds_0 / num_iterations << " - " << wc[1]*workItemTime/num_iterations << " = " << time << "                                                 \n";
-#endif
-    }
-}
-
-template <class barrier_type>
-void testbarrier_inner(barrier_type& b, int const num_threads, int const j, std::atomic<int>& t,std::atomic<int>& w,
-                       int const num_iterations_odd, int const num_iterations_even,
-                       int const num_items_noncritical, MersenneTwister& arg_mt, bool skip) {
-
-    for(int k = 0; k < (std::max)(num_iterations_even,num_iterations_odd); ++k) {
-
-        if(k >= (~j & 0x1 ? num_iterations_odd : num_iterations_even )) {
-            if(!skip)
-                b.arrive_and_drop();
-            break;
-        }
-
-        if(num_items_noncritical) {
-            // Do some work without holding the lock
-            int workunits = (int) (arg_mt.poissonInterval((float)num_items_noncritical) + 0.5f);
-            for (int i = 1; i < workunits; i++)
-                arg_mt.integer();       // Do one work unit
-            w.fetch_add(workunits,std::memory_order_relaxed);
-        }
-
-        t.fetch_add(1,std::memory_order_relaxed);
-
-        if(!skip) {
-            int const thiscount = (std::min)(k+1,num_iterations_odd)*((num_threads>>1)+(num_threads&1)) + (std::min)(k+1,num_iterations_even)*(num_threads>>1);
-            if(t.load(std::memory_order_relaxed) > thiscount) {
-                std::cerr << "FAILURE: some threads have run ahead of the barrier (" << t.load(std::memory_order_relaxed) << ">" <<  thiscount << ").\n";
-                EXPECT_TRUE(false);
-            }
-#ifdef USEOMP
-            #pragma omp barrier
-#else
-            b.arrive_and_wait();
-#endif
-            if(t.load(std::memory_order_relaxed) < thiscount) {
-                std::cerr << "FAILURE: some threads have fallen behind the barrier (" << t.load(std::memory_order_relaxed) << "<" << thiscount << ").\n";
-                EXPECT_TRUE(false);
-            }
-        }
-    }
-}
-template <class barrier_type>
-void testbarrier_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double barrier_frequency, double phase_duration, bool randomIterations = false) {
-
-    std::vector<double>& data = results[name];
-
-    double const workItemTime = time_item();
-    int const num_items_noncritical = int( phase_duration / workItemTime + 0.5 );
-
-    FOR_GAUNTLET(num_threads) {
-
-        int const num_iterations = int( barrier_frequency );
-#ifdef VERBOSE_TEST
-        std::cerr << "running " << name << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\r" << std::flush;
-#endif
-
-        srand(num_threads);
-
-        MersenneTwister local_mt;
-        int const num_iterations_odd = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations,
-                  num_iterations_even = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations;
-
-        std::atomic<int> t[2], w[2];
-        std::chrono::time_point<std::chrono::high_resolution_clock> start[2], end[2];
-        for(int pass = 0; pass < 2; ++pass) {
-
-            t[pass] = 0;
-            w[pass] = 0;
-
-            srand(num_threads);
-            std::vector<MersenneTwister> randoms(num_threads);
-
-            barrier_type b(num_threads);
-
-            start[pass] = std::chrono::high_resolution_clock::now();
-#ifdef USEOMP
-            omp_set_num_threads(num_threads);
-            std::atomic<int> _j(0);
-            #pragma omp parallel
-            {
-                int const j = _j.fetch_add(1,std::memory_order_relaxed);
-                testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
-                num_threads = omp_get_num_threads();
-            }
-#else
-            std::vector<std::thread*> threads(num_threads);
-            for(unsigned j = 0; j < num_threads; ++j)
-                threads[j] = new std::thread([&,j](){
-                    testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
-                });
-            for(unsigned j = 0; j < num_threads; ++j) {
-                threads[j]->join();
-                delete threads[j];
-            }
-#endif
-            end[pass] = std::chrono::high_resolution_clock::now();
-        }
-
-        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
-        if(w[0] != w[1]) throw std::string("mismatched work item counts");
-
-        int const phases = (std::max)(num_iterations_odd, num_iterations_even);
-
-        std::chrono::duration<double> elapsed_seconds_0 = end[0]-start[0],
-                                      elapsed_seconds_1 = end[1]-start[1];
-        double const time = (elapsed_seconds_1.count() - elapsed_seconds_0.count()) / phases;
-
-        data.push_back(time);
-#ifdef VERBOSE_TEST
-        std::cerr << name << " : " << num_threads << "," << elapsed_seconds_1.count() / phases << " - " << elapsed_seconds_0.count() / phases << " = " << time << "                                                 \n";
-#endif
-    }
-}
-
-template <class... T>
-struct mutex_tester;
-template <class F>
-struct mutex_tester<F> {
-    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
-        testmutex_outer<F>(results, *name, critical_fraction, critical_duration);
-    }
-};
-template <class F, class... T>
-struct mutex_tester<F,T...> {
-    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
-        mutex_tester<F>::run(results, name, critical_fraction, critical_duration);
-        mutex_tester<T...>::run(results, ++name, critical_fraction, critical_duration);
-    }
-};
-
-TEST( synchronic, main )
-{
-    //warm up
-    time_item();
-
-    //measure up
-#ifdef VERBOSE_TEST
-    std::cerr << "measuring work item speed...\r";
-    std::cerr << "work item speed is " << time_item() << " per item, nil is " << time_nil() << "\n";
-#endif
-    try {
-
-      std::pair<double,double> testpoints[] = { {1, 0}, /*{1E-1, 10E-3}, {5E-1, 2E-6},  {3E-1, 50E-9},*/ };
-        for(auto x : testpoints ) {
-
-            std::map<std::string,std::vector<double>> results;
-
-            //testbarrier_outer<std::barrier>(results, PREFIX"bar 1khz 100us", 1E3, x.second);
-
-            std::string const names[] = {
-                PREFIX"tkt", PREFIX"mcs", PREFIX"ttas", PREFIX"std"
-#ifdef WIN32
-                ,PREFIX"srw"
-#endif
-            };
-
-            //run -->
-
-            mutex_tester<
-                ticket_mutex, mcs_mutex, ttas_mutex, std::mutex
-#ifdef WIN32
-                ,srw_mutex
-#endif
-            >::run(results, names, x.first, x.second);
-
-            //<-- run
-
-#ifdef VERBOSE_TEST
-            std::cout << "threads";
-            for(auto & i : results)
-                std::cout << ",\"" << i.first << '\"';
-            std::cout << std::endl;
-            int j = 0;
-            FOR_GAUNTLET(num_threads) {
-                std::cout << num_threads;
-                for(auto & i : results)
-                    std::cout << ',' << i.second[j];
-                std::cout << std::endl;
-                ++j;
-            }
-#endif
-        }
-    }
-    catch(std::string & e) {
-        std::cerr << "EXCEPTION : " << e << std::endl;
-        EXPECT_TRUE( false );
-    }
-}
-
-} // namespace Test
-
-#endif
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.hpp b/lib/kokkos/core/unit_test/TestSynchronic.hpp
deleted file mode 100644
index f4341b97815b8d70956dfb85cf0d41a4f07bab4d..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestSynchronic.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef TEST_SYNCHRONIC_HPP
-#define TEST_SYNCHRONIC_HPP
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <mutex>
-#include <cmath>
-
-namespace Test {
-
-template <bool truly>
-struct dumb_mutex {
-
-    dumb_mutex () : locked(0) {
-    }
-
-    void lock() {
-        while(1) {
-            bool state = false;
-            if (locked.compare_exchange_weak(state,true,std::memory_order_acquire)) {
-                break;
-            }
-            while (locked.load(std::memory_order_relaxed)) {
-              if (!truly) {
-                Kokkos::Impl::portable_yield();
-              }
-            }
-        }
-    }
-
-    void unlock() {
-        locked.store(false,std::memory_order_release);
-    }
-
-private :
-    std::atomic<bool> locked;
-};
-
-#ifdef WIN32
-#include <winsock2.h>
-#include <windows.h>
-#include <synchapi.h>
-struct srw_mutex {
-
-    srw_mutex () {
-        InitializeSRWLock(&_lock);
-    }
-
-    void lock() {
-        AcquireSRWLockExclusive(&_lock);
-    }
-    void unlock() {
-        ReleaseSRWLockExclusive(&_lock);
-    }
-
-private :
-    SRWLOCK _lock;
-};
-#endif
-
-struct ttas_mutex {
-
-    ttas_mutex() : locked(false) {
-    }
-
-	ttas_mutex(const ttas_mutex&) = delete;
-	ttas_mutex& operator=(const ttas_mutex&) = delete;
-
-    void lock() {
-        for(int i = 0;; ++i) {
-            bool state = false;
-            if(locked.compare_exchange_weak(state,true,std::memory_order_relaxed,Kokkos::Impl::notify_none))
-                break;
-            locked.expect_update(true);
-        }
-        std::atomic_thread_fence(std::memory_order_acquire);
-    }
-    void unlock() {
-        locked.store(false,std::memory_order_release);
-    }
-
-private :
-    Kokkos::Impl::synchronic<bool> locked;
-};
-
-struct ticket_mutex {
-
-    ticket_mutex() : active(0), queue(0) {
-    }
-
-	ticket_mutex(const ticket_mutex&) = delete;
-	ticket_mutex& operator=(const ticket_mutex&) = delete;
-
-    void lock() {
-        int const me = queue.fetch_add(1, std::memory_order_relaxed);
-        while(me != active.load_when_equal(me, std::memory_order_acquire))
-            ;
-    }
-
-    void unlock() {
-        active.fetch_add(1,std::memory_order_release);
-    }
-private :
-    Kokkos::Impl::synchronic<int> active;
-    std::atomic<int> queue;
-};
-
-struct mcs_mutex {
-
-    mcs_mutex() : head(nullptr) {
-    }
-
-	mcs_mutex(const mcs_mutex&) = delete;
-	mcs_mutex& operator=(const mcs_mutex&) = delete;
-
-    struct unique_lock {
-
-        unique_lock(mcs_mutex & arg_m) : m(arg_m), next(nullptr), ready(false) {
-
-            unique_lock * const h = m.head.exchange(this,std::memory_order_acquire);
-            if(__builtin_expect(h != nullptr,0)) {
-                h->next.store(this,std::memory_order_seq_cst,Kokkos::Impl::notify_one);
-                while(!ready.load_when_not_equal(false,std::memory_order_acquire))
-                    ;
-            }
-        }
-
-	    unique_lock(const unique_lock&) = delete;
-	    unique_lock& operator=(const unique_lock&) = delete;
-
-        ~unique_lock() {
-            unique_lock * h = this;
-            if(__builtin_expect(!m.head.compare_exchange_strong(h,nullptr,std::memory_order_release, std::memory_order_relaxed),0)) {
-                unique_lock * n = next.load(std::memory_order_relaxed);
-                while(!n)
-                    n = next.load_when_not_equal(n,std::memory_order_relaxed);
-                n->ready.store(true,std::memory_order_release,Kokkos::Impl::notify_one);
-            }
-        }
-
-    private:
-        mcs_mutex & m;
-        Kokkos::Impl::synchronic<unique_lock*> next;
-        Kokkos::Impl::synchronic<bool> ready;
-    };
-
-private :
-    std::atomic<unique_lock*> head;
-};
-
-}
-
-namespace std {
-template<>
-struct unique_lock<Test::mcs_mutex> : Test::mcs_mutex::unique_lock {
-  unique_lock(Test::mcs_mutex & arg_m) : Test::mcs_mutex::unique_lock(arg_m) {
-  }
-  unique_lock(const unique_lock&) = delete;
-  unique_lock& operator=(const unique_lock&) = delete;
-};
-
-}
-
-/* #include <cmath> */
-#include <stdlib.h>
-
-namespace Test {
-
-//-------------------------------------
-//  MersenneTwister
-//-------------------------------------
-#define MT_IA  397
-#define MT_LEN 624
-
-class MersenneTwister
-{
-    volatile unsigned long m_buffer[MT_LEN][64/sizeof(unsigned long)];
-    volatile int m_index;
-
-public:
-    MersenneTwister() {
-        for (int i = 0; i < MT_LEN; i++)
-            m_buffer[i][0] = rand();
-        m_index = 0;
-        for (int i = 0; i < MT_LEN * 100; i++)
-            integer();
-    }
-    unsigned long integer() {
-        // Indices
-        int i = m_index;
-        int i2 = m_index + 1; if (i2 >= MT_LEN) i2 = 0; // wrap-around
-        int j = m_index + MT_IA; if (j >= MT_LEN) j -= MT_LEN; // wrap-around
-
-        // Twist
-        unsigned long s = (m_buffer[i][0] & 0x80000000) | (m_buffer[i2][0] & 0x7fffffff);
-        unsigned long r = m_buffer[j][0] ^ (s >> 1) ^ ((s & 1) * 0x9908B0DF);
-        m_buffer[m_index][0] = r;
-        m_index = i2;
-
-        // Swizzle
-        r ^= (r >> 11);
-        r ^= (r << 7) & 0x9d2c5680UL;
-        r ^= (r << 15) & 0xefc60000UL;
-        r ^= (r >> 18);
-        return r;
-    }
-    float poissonInterval(float ooLambda) {
-        return -logf(1.0f - integer() * 2.3283e-10f) * ooLambda;
-    }
-};
-
-} // namespace Test
-
-#endif //TEST_HPP
diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
index 1134553980f8a63351f85a86b33537a35d52644c..57e47d4baa0d177dca9379cf43a05742af2519d1 100644
--- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
+++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,11 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-
 #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
 #define KOKKOS_UNITTEST_TASKSCHEDULER_HPP
 
@@ -51,9 +50,6 @@
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 namespace TestTaskScheduler {
 
 namespace {
@@ -61,14 +57,14 @@ namespace {
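+// Serial reference evaluation of Fibonacci; the task-based results below are checked against it.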
 inline
 long eval_fib( long n )
 {
-  constexpr long mask = 0x03 ;
+  constexpr long mask = 0x03;
 
-  long fib[4] = { 0 , 1 , 1 , 2 };
+  long fib[4] = { 0, 1, 1, 2 };
 
-  for ( long i = 2 ; i <= n ; ++i ) {
+  for ( long i = 2; i <= n; ++i ) {
     fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
   }
-  
+
   return fib[ n & mask ];
 }
 
@@ -77,100 +73,93 @@ long eval_fib( long n )
 template< typename Space >
 struct TestFib
 {
-  typedef Kokkos::TaskScheduler<Space>  policy_type ;
-  typedef Kokkos::Future<long,Space> future_type ;
-  typedef long value_type ;
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< long, Space >   future_type;
+  typedef long                            value_type;
 
-  policy_type policy ;
-  future_type fib_m1 ;
-  future_type fib_m2 ;
-  const value_type n ;
+  sched_type  sched;
+  future_type fib_m1;
+  future_type fib_m2;
+  const value_type n;
 
   KOKKOS_INLINE_FUNCTION
-  TestFib( const policy_type & arg_policy , const value_type arg_n )
-    : policy(arg_policy)
-    , fib_m1() , fib_m2()
-    , n( arg_n )
-    {}
+  TestFib( const sched_type & arg_sched, const value_type arg_n )
+    : sched( arg_sched ), fib_m1(), fib_m2(), n( arg_n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & , value_type & result )
-    {
+  void operator()( typename sched_type::member_type &, value_type & result )
+  {
 #if 0
-      printf( "\nTestFib(%ld) %d %d\n"
-             , n
-             , int( ! fib_m1.is_null() )
-             , int( ! fib_m2.is_null() )
-             );
+    printf( "\nTestFib(%ld) %d %d\n", n, int( !fib_m1.is_null() ), int( !fib_m2.is_null() ) );
 #endif
 
-      if ( n < 2 ) {
-        result = n ;
-      }
-      else if ( ! fib_m2.is_null() && ! fib_m1.is_null() ) {
-        result = fib_m1.get() + fib_m2.get();
-      }
-      else {
-
-        // Spawn new children and respawn myself to sum their results:
-        // Spawn lower value at higher priority as it has a shorter
-        // path to completion.
-
-        fib_m2 = policy.task_spawn( TestFib(policy,n-2)
-                                  , Kokkos::TaskSingle
-                                  , Kokkos::TaskHighPriority );
+    if ( n < 2 ) {
+      result = n;
+    }
+    else if ( !fib_m2.is_null() && !fib_m1.is_null() ) {
+      result = fib_m1.get() + fib_m2.get();
+    }
+    else {
+      // Spawn new children and respawn myself to sum their results.
+      // Spawn lower value at higher priority as it has a shorter
+      // path to completion.
 
-        fib_m1 = policy.task_spawn( TestFib(policy,n-1)
-                                  , Kokkos::TaskSingle );
+      fib_m2 = Kokkos::task_spawn( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
+                                 , TestFib( sched, n - 2 ) );
 
-        Kokkos::Future<Space> dep[] = { fib_m1 , fib_m2 };
+      fib_m1 = Kokkos::task_spawn( Kokkos::TaskSingle( sched )
+                                 , TestFib( sched, n - 1 ) );
 
-        Kokkos::Future<Space> fib_all = policy.when_all( 2 , dep );
+      Kokkos::Future< Space > dep[] = { fib_m1, fib_m2 };
+      Kokkos::Future< Space > fib_all = Kokkos::when_all( dep, 2 );
 
-        if ( ! fib_m2.is_null() && ! fib_m1.is_null() && ! fib_all.is_null() ) {
-          // High priority to retire this branch
-          policy.respawn( this , Kokkos::TaskHighPriority , fib_all );
-        }
-        else {
+      if ( !fib_m2.is_null() && !fib_m1.is_null() && !fib_all.is_null() ) {
+        // High priority to retire this branch.
+        Kokkos::respawn( this, fib_all, Kokkos::TaskPriority::High );
+      }
+      else {
 #if 1
-      printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
-             , n
-             , policy.allocation_capacity()
-             , policy.allocated_task_count_max()
-             , policy.allocated_task_count_accum()
-             );
+        printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+               , n
+               , sched.allocation_capacity()
+               , sched.allocated_task_count_max()
+               , sched.allocated_task_count_accum()
+               );
 #endif
-          Kokkos::abort("TestFib insufficient memory");
 
-        }
+        Kokkos::abort( "TestFib insufficient memory" );
+
       }
     }
+  }
 
-  static void run( int i , size_t MemoryCapacity = 16000 )
-    {
-      typedef typename policy_type::memory_space memory_space ;
+  static void run( int i, size_t MemoryCapacity = 16000 )
+  {
+    typedef typename sched_type::memory_space memory_space;
 
-      enum { Log2_SuperBlockSize = 12 };
+    enum { Log2_SuperBlockSize = 12 };
 
-      policy_type root_policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+    sched_type root_sched( memory_space(), MemoryCapacity, Log2_SuperBlockSize );
 
-      future_type f = root_policy.host_spawn( TestFib(root_policy,i) , Kokkos::TaskSingle );
-      Kokkos::wait( root_policy );
-      ASSERT_EQ( eval_fib(i) , f.get() );
+    future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched )
+                                      , TestFib( root_sched, i ) );
+
+    Kokkos::wait( root_sched );
+
+    ASSERT_EQ( eval_fib( i ), f.get() );
 
 #if 0
-      fprintf( stdout , "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
-             , i
-             , int(root_policy.template spawn_allocation_size<TestFib>())
-             , int(root_policy.when_all_allocation_size(2))
-             , root_policy.allocation_capacity()
-             , root_policy.allocated_task_count_max()
-             , root_policy.allocated_task_count_accum()
-             );
-      fflush( stdout );
+    fprintf( stdout, "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+           , i
+           , int(root_sched.template spawn_allocation_size<TestFib>())
+           , int(root_sched.when_all_allocation_size(2))
+           , root_sched.allocation_capacity()
+           , root_sched.allocated_task_count_max()
+           , root_sched.allocated_task_count_accum()
+           );
+    fflush( stdout );
 #endif
-    }
-
+  }
 };
 
 } // namespace TestTaskScheduler
@@ -181,73 +170,71 @@ namespace TestTaskScheduler {
 
 template< class Space >
 struct TestTaskDependence {
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< Space >         future_type;
+  typedef Kokkos::View< long, Space >     accum_type;
+  typedef void                            value_type;
 
-  typedef Kokkos::TaskScheduler<Space>  policy_type ;
-  typedef Kokkos::Future<Space>      future_type ;
-  typedef Kokkos::View<long,Space>   accum_type ;
-  typedef void value_type ;
-
-  policy_type  m_policy ;
-  accum_type   m_accum ;
-  long         m_count ;
+  sched_type  m_sched;
+  accum_type  m_accum;
+  long        m_count;
 
   KOKKOS_INLINE_FUNCTION
   TestTaskDependence( long n
-                    , const policy_type & arg_policy
-                    , const accum_type  & arg_accum )
-    : m_policy( arg_policy )
+                    , const sched_type & arg_sched
+                    , const accum_type & arg_accum )
+    : m_sched( arg_sched )
     , m_accum( arg_accum )
-    , m_count( n )
-    {}
+    , m_count( n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & )
-    {
-       enum { CHUNK = 8 };
-       const int n = CHUNK < m_count ? CHUNK : m_count ;
+  void operator()( typename sched_type::member_type & )
+  {
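+    // Split m_count across up to CHUNK child tasks and respawn this task to run after they complete;
+    // a leaf task ( m_count == 1 ) increments the shared counter instead.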
+    enum { CHUNK = 8 };
+    const int n = CHUNK < m_count ? CHUNK : m_count;
 
-       if ( 1 < m_count ) {
-         future_type f[ CHUNK ] ;
+    if ( 1 < m_count ) {
+      future_type f[ CHUNK ];
 
-         const int inc = ( m_count + n - 1 ) / n ;
+      const int inc = ( m_count + n - 1 ) / n;
 
-         for ( int i = 0 ; i < n ; ++i ) {
-           long begin = i * inc ;
-           long count = begin + inc < m_count ? inc : m_count - begin ;
-           f[i] = m_policy.task_spawn( TestTaskDependence(count,m_policy,m_accum) , Kokkos::TaskSingle );
-         }
+      for ( int i = 0; i < n; ++i ) {
+        long begin = i * inc;
+        long count = begin + inc < m_count ? inc : m_count - begin;
+        f[i] = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched )
+                                 , TestTaskDependence( count, m_sched, m_accum ) );
+      }
 
-         m_count = 0 ;
+      m_count = 0;
 
-         m_policy.respawn( this , m_policy.when_all( n , f ) );
-       }
-       else if ( 1 == m_count ) {
-         Kokkos::atomic_increment( & m_accum() );
-       }
+      Kokkos::respawn( this, Kokkos::when_all( f, n ) );
+    }
+    else if ( 1 == m_count ) {
+      Kokkos::atomic_increment( & m_accum() );
     }
+  }
 
   static void run( int n )
-    {
-      typedef typename policy_type::memory_space memory_space ;
+  {
+    typedef typename sched_type::memory_space memory_space;
 
-      // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool
-      enum { MemoryCapacity = 16000 };
-      enum { Log2_SuperBlockSize = 12 };
-      policy_type policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+    // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
+    enum { MemoryCapacity = 16000 };
+    enum { Log2_SuperBlockSize = 12 };
+    sched_type sched( memory_space(), MemoryCapacity, Log2_SuperBlockSize );
 
-      accum_type accum("accum");
+    accum_type accum( "accum" );
 
-      typename accum_type::HostMirror host_accum =
-        Kokkos::create_mirror_view( accum );
+    typename accum_type::HostMirror host_accum = Kokkos::create_mirror_view( accum );
 
-      policy.host_spawn( TestTaskDependence(n,policy,accum) , Kokkos::TaskSingle );
+    Kokkos::host_spawn( Kokkos::TaskSingle( sched ), TestTaskDependence( n, sched, accum ) );
 
-      Kokkos::wait( policy );
+    Kokkos::wait( sched );
 
-      Kokkos::deep_copy( host_accum , accum );
+    Kokkos::deep_copy( host_accum, accum );
 
-      ASSERT_EQ( host_accum() , n );
-    }
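+    // Each leaf task increments 'accum' exactly once, so the final total must equal n.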
+    ASSERT_EQ( host_accum(), n );
+  }
 };
 
 } // namespace TestTaskScheduler
@@ -258,294 +245,317 @@ namespace TestTaskScheduler {
 
 template< class ExecSpace >
 struct TestTaskTeam {
-
   //enum { SPAN = 8 };
   enum { SPAN = 33 };
   //enum { SPAN = 1 };
 
-  typedef void value_type ;
-  typedef Kokkos::TaskScheduler<ExecSpace>  policy_type ;
-  typedef Kokkos::Future<ExecSpace>      future_type ;
-  typedef Kokkos::View<long*,ExecSpace>  view_type ;
+  typedef void                                value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >  sched_type;
+  typedef Kokkos::Future< ExecSpace >         future_type;
+  typedef Kokkos::View< long*, ExecSpace >    view_type;
 
-  policy_type  policy ;
-  future_type  future ;
+  sched_type   sched;
+  future_type  future;
 
-  view_type  parfor_result ;
-  view_type  parreduce_check ;
-  view_type  parscan_result ;
-  view_type  parscan_check ;
-  const long nvalue ;
+  view_type   parfor_result;
+  view_type   parreduce_check;
+  view_type   parscan_result;
+  view_type   parscan_check;
+  const long  nvalue;
 
   KOKKOS_INLINE_FUNCTION
-  TestTaskTeam( const policy_type & arg_policy
-              , const view_type   & arg_parfor_result
-              , const view_type   & arg_parreduce_check
-              , const view_type   & arg_parscan_result
-              , const view_type   & arg_parscan_check
-              , const long          arg_nvalue )
-    : policy(arg_policy)
+  TestTaskTeam( const sched_type & arg_sched
+              , const view_type  & arg_parfor_result
+              , const view_type  & arg_parreduce_check
+              , const view_type  & arg_parscan_result
+              , const view_type  & arg_parscan_check
+              , const long         arg_nvalue )
+    : sched( arg_sched )
     , future()
     , parfor_result( arg_parfor_result )
     , parreduce_check( arg_parreduce_check )
     , parscan_result( arg_parscan_result )
     , parscan_check( arg_parscan_check )
-    , nvalue( arg_nvalue )
-    {}
+    , nvalue( arg_nvalue ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & member )
-    {
-      const long end   = nvalue + 1 ;
-      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
-
-      if ( 0 < begin && future.is_null() ) {
-        if ( member.team_rank() == 0 ) {
-          future = policy.task_spawn
-            ( TestTaskTeam( policy ,
-                            parfor_result ,
-                            parreduce_check,
-                            parscan_result,
-                            parscan_check,
-                            begin - 1 )
-            , Kokkos::TaskTeam );
-
-          assert( ! future.is_null() );
-
-          policy.respawn( this , future );
-        }
-        return ;
-      }
+  void operator()( typename sched_type::member_type & member )
+  {
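+    // Each task covers the index span [ begin, end ); if earlier indices remain, it first spawns a
+    // predecessor team task for them and respawns itself to run afterwards.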
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
 
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parfor_result[i] = i ; }
-                          );
-
-      // test parallel_reduce without join
-    
-      long tot = 0;
-      long expected = (begin+end-1)*(end-begin)*0.5;
-      
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &res) { res += parfor_result[i]; }
-                          , tot);
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parreduce_check[i] = expected-tot ; }
-                          );
-
-      // test parallel_reduce with join
-
-      tot = 0;
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &res) { res += parfor_result[i]; }
-                          , [&]( long& val1, const long& val2) { val1 += val2; }
-                          , tot);
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parreduce_check[i] += expected-tot ; }
-                          );
-
-      // test parallel_scan
-
-      // Exclusive scan
-      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &val , const bool final ) {
-                              if ( final ) { parscan_result[i] = val; }
-                              val += i;
-                            }
-                          );
+    if ( 0 < begin && future.is_null() ) {
       if ( member.team_rank() == 0 ) {
-        for ( long i = begin ; i < end ; ++i ) {
-          parscan_check[i] = (i*(i-1)-begin*(begin-1))*0.5-parscan_result[i];
-        }
+        future = Kokkos::task_spawn( Kokkos::TaskTeam( sched )
+                                   , TestTaskTeam( sched
+                                                 , parfor_result
+                                                 , parreduce_check
+                                                 , parscan_result
+                                                 , parscan_check
+                                                 , begin - 1 )
+                                   );
+
+        assert( !future.is_null() );
+
+        Kokkos::respawn( this, future );
       }
 
-      // Inclusive scan
-      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &val , const bool final ) {
-                              val += i;
-                              if ( final ) { parscan_result[i] = val; }
-                            }
-                          );
-      if ( member.team_rank() == 0 ) {
-        for ( long i = begin ; i < end ; ++i ) {
-          parscan_check[i] += (i*(i+1)-begin*(begin-1))*0.5-parscan_result[i];
-        }
+      return;
+    }
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parfor_result[i] = i; }
+                        );
+
+    // Test parallel_reduce without join.
+
+    long tot = 0;
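+    // Expected value: the closed-form sum of i over [ begin, end ), i.e. ( begin + end - 1 ) * ( end - begin ) / 2.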
+    long expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+                           , tot
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] = expected - tot; }
+                        );
+
+    // Test parallel_reduce with join.
+
+    tot = 0;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+#if 0
+                           , Kokkos::Sum( tot )
+#else
+                           , [] ( long & dst, const long & src ) { dst += src; }
+                           , tot
+#endif
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] += expected - tot; }
+                        );
+
+    // Test parallel_scan.
+
+    // Exclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
+    {
+      if ( final ) { parscan_result[i] = val; }
+
+      val += i;
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
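+    // The exclusive scan at i should equal ( i * ( i - 1 ) - begin * ( begin - 1 ) ) / 2,
+    // the sum of j over [ begin, i ); a correct scan leaves parscan_check[i] == 0.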
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] = ( i * ( i - 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
       }
-      // ThreadVectorRange check
-      /*
-      long result = 0;
-      expected = (begin+end-1)*(end-begin)*0.5;
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , 0 , 1 )
-                             , [&] ( const int i , long & outerUpdate ) {
-                                 long sum_j = 0.0;
-                                 Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , end - begin )
-                                                        , [&] ( const int j , long &innerUpdate ) {
-                                                            innerUpdate += begin+j;
-                                                          } , sum_j );
-                                 outerUpdate += sum_j ;
-                               } , result );
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) {
-                              parreduce_check[i] += result-expected ;
-                            }
-                          );
-      */
     }
 
-  static void run( long n )
+    // Don't overwrite 'parscan_result' until it has been tested.
+    member.team_barrier();
+
+    // Inclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
     {
-      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
-      // const unsigned memory_capacity = 100000 ; // fails with SPAN=1 for serial and OMP
-      const unsigned memory_capacity = 400000 ;
-
-      policy_type root_policy( typename policy_type::memory_space()
-                        , memory_capacity );
-
-      view_type   root_parfor_result("parfor_result",n+1);
-      view_type   root_parreduce_check("parreduce_check",n+1);
-      view_type   root_parscan_result("parscan_result",n+1);
-      view_type   root_parscan_check("parscan_check",n+1);
-
-      typename view_type::HostMirror
-        host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
-      typename view_type::HostMirror
-        host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
-      typename view_type::HostMirror
-        host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
-      typename view_type::HostMirror
-        host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
-
-      future_type f = root_policy.host_spawn(
-                        TestTaskTeam( root_policy ,
-                                      root_parfor_result ,
-                                      root_parreduce_check ,
-                                      root_parscan_result,
-                                      root_parscan_check,
-                                      n ) ,
-                        Kokkos::TaskTeam );
-
-      Kokkos::wait( root_policy );
-
-      Kokkos::deep_copy( host_parfor_result , root_parfor_result );
-      Kokkos::deep_copy( host_parreduce_check , root_parreduce_check );
-      Kokkos::deep_copy( host_parscan_result , root_parscan_result );
-      Kokkos::deep_copy( host_parscan_check , root_parscan_check );
-
-      for ( long i = 0 ; i <= n ; ++i ) {
-        const long answer = i ;
-        if ( host_parfor_result(i) != answer ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
-                    << host_parfor_result(i) << " != " << answer << std::endl ;
-        }
-        if ( host_parreduce_check(i) != 0 ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
-                    << host_parreduce_check(i) << " != 0" << std::endl ;
-        }
-        if ( host_parscan_check(i) != 0 ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
-                    << host_parscan_check(i) << " != 0" << std::endl ;
-        }
+      val += i;
+
+      if ( final ) { parscan_result[i] = val; }
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
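+    // The inclusive scan at i should equal ( i * ( i + 1 ) - begin * ( begin - 1 ) ) / 2, the sum of j over [ begin, i ].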
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] += ( i * ( i + 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
       }
     }
+
+    // ThreadVectorRange check.
+/*
+    long result = 0;
+    expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, 0, 1 )
+                           , [&] ( const int i, long & outerUpdate )
+    {
+      long sum_j = 0.0;
+
+      Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, end - begin )
+                             , [&] ( const int j, long & innerUpdate )
+      {
+        innerUpdate += begin + j;
+      }, sum_j );
+
+      outerUpdate += sum_j;
+    }, result );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i )
+    {
+      parreduce_check[i] += result - expected;
+    });
+*/
+  }
+
+  static void run( long n )
+  {
+    //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
+    //const unsigned memory_capacity = 100000; // Fails with SPAN=1 for serial and OMP.
+    const unsigned memory_capacity = 400000;
+
+    sched_type root_sched( typename sched_type::memory_space(), memory_capacity );
+
+    view_type root_parfor_result( "parfor_result", n + 1 );
+    view_type root_parreduce_check( "parreduce_check", n + 1 );
+    view_type root_parscan_result( "parscan_result", n + 1 );
+    view_type root_parscan_check( "parscan_check", n + 1 );
+
+    typename view_type::HostMirror
+      host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
+    typename view_type::HostMirror
+      host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
+    typename view_type::HostMirror
+      host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
+    typename view_type::HostMirror
+      host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
+
+    future_type f = Kokkos::host_spawn( Kokkos::TaskTeam( root_sched )
+                                      , TestTaskTeam( root_sched
+                                                    , root_parfor_result
+                                                    , root_parreduce_check
+                                                    , root_parscan_result
+                                                    , root_parscan_check
+                                                    , n )
+                                      );
+
+    Kokkos::wait( root_sched );
+
+    Kokkos::deep_copy( host_parfor_result, root_parfor_result );
+    Kokkos::deep_copy( host_parreduce_check, root_parreduce_check );
+    Kokkos::deep_copy( host_parscan_result, root_parscan_result );
+    Kokkos::deep_copy( host_parscan_check, root_parscan_check );
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i;
+
+      if ( host_parfor_result( i ) != answer ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
+                  << host_parfor_result( i ) << " != " << answer << std::endl;
+      }
+
+      if ( host_parreduce_check( i ) != 0 ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
+                  << host_parreduce_check( i ) << " != 0" << std::endl;
+      }
+
+      if ( host_parscan_check( i ) != 0 ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
+                  << host_parscan_check( i ) << " != 0" << std::endl;
+      }
+    }
+  }
 };
 
 template< class ExecSpace >
 struct TestTaskTeamValue {
-
   enum { SPAN = 8 };
 
-  typedef long value_type ;
-  typedef Kokkos::TaskScheduler<ExecSpace>         policy_type ;
-  typedef Kokkos::Future<value_type,ExecSpace>  future_type ;
-  typedef Kokkos::View<long*,ExecSpace>         view_type ;
+  typedef long                                     value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >       sched_type;
+  typedef Kokkos::Future< value_type, ExecSpace >  future_type;
+  typedef Kokkos::View< long*, ExecSpace >         view_type;
 
-  policy_type  policy ;
-  future_type  future ;
+  sched_type   sched;
+  future_type  future;
 
-  view_type  result ;
-  const long nvalue ;
+  view_type   result;
+  const long  nvalue;
 
   KOKKOS_INLINE_FUNCTION
-  TestTaskTeamValue( const policy_type & arg_policy
-                   , const view_type   & arg_result
-                   , const long          arg_nvalue )
-    : policy(arg_policy)
+  TestTaskTeamValue( const sched_type & arg_sched
+                   , const view_type  & arg_result
+                   , const long         arg_nvalue )
+    : sched( arg_sched )
     , future()
     , result( arg_result )
-    , nvalue( arg_nvalue )
-    {}
+    , nvalue( arg_nvalue ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type const & member
+  void operator()( typename sched_type::member_type const & member
                  , value_type & final )
-    {
-      const long end   = nvalue + 1 ;
-      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+  {
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
 
-      if ( 0 < begin && future.is_null() ) {
-        if ( member.team_rank() == 0 ) {
-
-          future = policy.task_spawn
-            ( TestTaskTeamValue( policy , result , begin - 1 )
-            , Kokkos::TaskTeam );
+    if ( 0 < begin && future.is_null() ) {
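+      // First pass: team rank 0 spawns a task covering the preceding span [0, begin-1]
+      // and respawns this task to run once that dependency completes; all ranks return until then.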
+      if ( member.team_rank() == 0 ) {
+        future = sched.task_spawn( TestTaskTeamValue( sched, result, begin - 1 )
+                                 , Kokkos::TaskTeam );
 
-          assert( ! future.is_null() );
+        assert( !future.is_null() );
 
-          policy.respawn( this , future );
-        }
-        return ;
+        sched.respawn( this, future );
       }
 
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { result[i] = i + 1 ; }
-                          );
+      return;
+    }
 
-      if ( member.team_rank() == 0 ) {
-        final = result[nvalue] ;
-      }
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { result[i] = i + 1; }
+                        );
 
-      Kokkos::memory_fence();
+    if ( member.team_rank() == 0 ) {
+      final = result[nvalue];
     }
 
+    Kokkos::memory_fence();
+  }
+
   static void run( long n )
-    {
-      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
-      const unsigned memory_capacity = 100000 ;
+  {
+    //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
+    const unsigned memory_capacity = 100000;
 
-      policy_type root_policy( typename policy_type::memory_space()
-                             , memory_capacity );
+    sched_type root_sched( typename sched_type::memory_space()
+                          , memory_capacity );
 
-      view_type   root_result("result",n+1);
+    view_type root_result( "result", n + 1 );
 
-      typename view_type::HostMirror
-        host_result = Kokkos::create_mirror_view( root_result );
+    typename view_type::HostMirror host_result = Kokkos::create_mirror_view( root_result );
 
-      future_type fv = root_policy.host_spawn
-        ( TestTaskTeamValue( root_policy, root_result, n ) , Kokkos::TaskTeam );
+    future_type fv = root_sched.host_spawn( TestTaskTeamValue( root_sched, root_result, n )
+                                          , Kokkos::TaskTeam );
 
-      Kokkos::wait( root_policy );
+    Kokkos::wait( root_sched );
 
-      Kokkos::deep_copy( host_result , root_result );
+    Kokkos::deep_copy( host_result, root_result );
 
-      if ( fv.get() != n + 1 ) {
-        std::cerr << "TestTaskTeamValue ERROR future = "
-                  << fv.get() << " != " << n + 1 << std::endl ;
-      }
-      for ( long i = 0 ; i <= n ; ++i ) {
-        const long answer = i + 1 ;
-        if ( host_result(i) != answer ) {
-          std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
-                    << host_result(i) << " != " << answer << std::endl ;
-        }
+    if ( fv.get() != n + 1 ) {
+      std::cerr << "TestTaskTeamValue ERROR future = "
+                << fv.get() << " != " << n + 1 << std::endl;
+    }
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i + 1;
+
+      if ( host_result( i ) != answer ) {
+        std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
+                  << host_result( i ) << " != " << answer << std::endl;
       }
     }
+  }
 };
-} // namespace TestTaskScheduler
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 
-#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP */
+} // namespace TestTaskScheduler
 
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
 
+#endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index bcf4d3a173686ad8b1d14abc45ee957bb8650389..11a523921db9995c18d38ac5e18661244acd0ecb 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -48,177 +48,169 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestTeamPolicy {
+  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member;
+  typedef Kokkos::View< int**, ExecSpace > view_type;
 
-  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member ;
-  typedef Kokkos::View<int**,ExecSpace> view_type ;
-
-  view_type m_flags ;
+  view_type m_flags;
 
   TestTeamPolicy( const size_t league_size )
-    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags")
-             , Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this )
-             , league_size )
-    {}
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ),
+               Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this ),
+               league_size ) {}
 
   struct VerifyInitTag {};
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const team_member & member ) const
-    {
-      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
 
-      m_flags( member.team_rank() , member.league_rank() ) = tid ;
-    }
+    m_flags( member.team_rank(), member.league_rank() ) = tid;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyInitTag & , const team_member & member ) const
-    {
-      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+  void operator()( const VerifyInitTag &, const team_member & member ) const
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
 
-      if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) {
-        printf("TestTeamPolicy member(%d,%d) error %d != %d\n"
-              , member.league_rank() , member.team_rank()
-              , tid , m_flags( member.team_rank() , member.league_rank() ) );
-      }
+    if ( tid != m_flags( member.team_rank(), member.league_rank() ) ) {
+      printf( "TestTeamPolicy member(%d,%d) error %d != %d\n",
+               member.league_rank(), member.team_rank(),
+               tid, m_flags( member.team_rank(), member.league_rank() ) );
     }
+  }
 
-  // included for test_small_league_size
-  TestTeamPolicy()
-    : m_flags()
-  {}
+  // Included for test_small_league_size.
+  TestTeamPolicy() : m_flags() {}
+
+  // Included for test_small_league_size.
+  struct NoOpTag {};
 
-  // included for test_small_league_size
-  struct NoOpTag {} ;
   KOKKOS_INLINE_FUNCTION
-  void operator()( const NoOpTag & , const team_member & member ) const
-    {}
+  void operator()( const NoOpTag &, const team_member & member ) const {}
 
 
   static void test_small_league_size() {
-
     int bs = 8; // batch size (number of elements per batch)
     int ns = 16; // total number of "problems" to process
 
-    // calculate total scratch memory space size
+    // Calculate total scratch memory space size.
     const int level = 0;
     int mem_size = 960;
-    const int num_teams = ns/bs;
-    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO());
+    const int num_teams = ns / bs;
+    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy( num_teams, Kokkos::AUTO() );
 
-    Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0))
-                         , TestTeamPolicy()
-                         );
+    Kokkos::parallel_for( policy.set_scratch_size( level, Kokkos::PerTeam( mem_size ), Kokkos::PerThread( 0 ) ),
+                          TestTeamPolicy() );
   }
 
   static void test_for( const size_t league_size )
-    {
-      TestTeamPolicy functor( league_size );
+  {
+    TestTeamPolicy functor( league_size );
 
-      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
 
-      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor );
-      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size, team_size ), functor );
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace, VerifyInitTag >( league_size, team_size ), functor );
 
-      test_small_league_size();
-    }
+    test_small_league_size();
+  }
 
   struct ReduceTag {};
 
-  typedef long value_type ;
+  typedef long value_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const team_member & member , value_type & update ) const
-    {
-      update += member.team_rank() + member.team_size() * member.league_rank();
-    }
+  void operator()( const team_member & member, value_type & update ) const
+  {
+    update += member.team_rank() + member.team_size() * member.league_rank();
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const ReduceTag & , const team_member & member , value_type & update ) const
-    {
-      update += 1 + member.team_rank() + member.team_size() * member.league_rank();
-    }
+  void operator()( const ReduceTag &, const team_member & member, value_type & update ) const
+  {
+    update += 1 + member.team_rank() + member.team_size() * member.league_rank();
+  }
 
   static void test_reduce( const size_t league_size )
-    {
-      TestTeamPolicy functor( league_size );
+  {
+    TestTeamPolicy functor( league_size );
 
-      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
-      const long N = team_size * league_size ;
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+    const long N = team_size * league_size;
 
-      long total = 0 ;
+    long total = 0;
 
-      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor , total );
-      ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) );
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size, team_size ), functor, total );
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) ) / 2, size_t( total ) );
 
-      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , ReduceTag >( league_size , team_size ) , functor , total );
-      ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) );
-    }
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace, ReduceTag >( league_size, team_size ), functor, total );
+    ASSERT_EQ( ( size_t( N ) * size_t( N + 1 ) ) / 2, size_t( total ) );
+  }
 };
 
-}
-}
+} // namespace
+
+} // namespace Test
 
 /*--------------------------------------------------------------------------*/
 
 namespace Test {
 
-template< typename ScalarType , class DeviceType, class ScheduleType >
+template< typename ScalarType, class DeviceType, class ScheduleType >
 class ReduceTeamFunctor
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
-  typedef typename execution_space::size_type        size_type ;
+  typedef DeviceType                                           execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
+  typedef typename execution_space::size_type                  size_type;
 
   struct value_type {
-    ScalarType value[3] ;
+    ScalarType value[3];
   };
 
-  const size_type nwork ;
+  const size_type nwork;
 
   ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
 
-  ReduceTeamFunctor( const ReduceTeamFunctor & rhs )
-    : nwork( rhs.nwork ) {}
+  ReduceTeamFunctor( const ReduceTeamFunctor & rhs ) : nwork( rhs.nwork ) {}
 
   KOKKOS_INLINE_FUNCTION
   void init( value_type & dst ) const
   {
-    dst.value[0] = 0 ;
-    dst.value[1] = 0 ;
-    dst.value[2] = 0 ;
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile value_type & dst ,
-             const volatile value_type & src ) const
+  void join( volatile value_type & dst, const volatile value_type & src ) const
   {
-    dst.value[0] += src.value[0] ;
-    dst.value[1] += src.value[1] ;
-    dst.value[2] += src.value[2] ;
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type ind , value_type & dst ) const
+  void operator()( const typename policy_type::member_type ind, value_type & dst ) const
   {
     const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
     const int thread_size = ind.team_size() * ind.league_size();
-    const int chunk = ( nwork + thread_size - 1 ) / thread_size ;
+    const int chunk = ( nwork + thread_size - 1 ) / thread_size;
 
-    size_type iwork = chunk * thread_rank ;
-    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork ;
+    size_type iwork = chunk * thread_rank;
+    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
 
-    for ( ; iwork < iwork_end ; ++iwork ) {
-      dst.value[0] += 1 ;
-      dst.value[1] += iwork + 1 ;
-      dst.value[2] += nwork - iwork ;
+    for ( ; iwork < iwork_end; ++iwork ) {
+      dst.value[0] += 1;
+      dst.value[1] += iwork + 1;
+      dst.value[2] += nwork - iwork;
     }
   }
 };
@@ -227,58 +219,53 @@ public:
 
 namespace {
 
-template< typename ScalarType , class DeviceType, class ScheduleType >
+template< typename ScalarType, class DeviceType, class ScheduleType >
 class TestReduceTeam
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
-  typedef typename execution_space::size_type    size_type ;
-
-  //------------------------------------
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef typename execution_space::size_type                   size_type;
 
-  TestReduceTeam( const size_type & nwork )
-  {
-    run_test(nwork);
-  }
+  TestReduceTeam( const size_type & nwork ) { run_test( nwork ); }
 
   void run_test( const size_type & nwork )
   {
-    typedef Test::ReduceTeamFunctor< ScalarType , execution_space , ScheduleType> functor_type ;
-    typedef typename functor_type::value_type value_type ;
-    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ;
+    typedef Test::ReduceTeamFunctor< ScalarType, execution_space, ScheduleType > functor_type;
+    typedef typename functor_type::value_type value_type;
+    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    const unsigned team_size   = policy_type::team_size_recommended( functor_type(nwork) );
-    const unsigned league_size = ( nwork + team_size - 1 ) / team_size ;
+    const unsigned team_size   = policy_type::team_size_recommended( functor_type( nwork ) );
+    const unsigned league_size = ( nwork + team_size - 1 ) / team_size;
 
-    policy_type team_exec( league_size , team_size );
+    policy_type team_exec( league_size, team_size );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+    for ( unsigned i = 0; i < Repeat; ++i ) {
       result_type tmp( & result[i] );
-      Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp );
+      Kokkos::parallel_reduce( team_exec, functor_type( nwork ), tmp );
     }
 
     execution_space::fence();
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
       }
     }
   }
 };
 
-}
+} // namespace
 
 /*--------------------------------------------------------------------------*/
 
@@ -288,53 +275,51 @@ template< class DeviceType, class ScheduleType >
 class ScanTeamFunctor
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef long int                                              value_type;
 
-  typedef long int    value_type ;
-  Kokkos::View< value_type , execution_space > accum ;
-  Kokkos::View< value_type , execution_space > total ;
+  Kokkos::View< value_type, execution_space > accum;
+  Kokkos::View< value_type, execution_space > total;
 
-  ScanTeamFunctor() : accum("accum"), total("total") {}
+  ScanTeamFunctor() : accum( "accum" ), total( "total" ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init( value_type & error ) const { error = 0 ; }
+  void init( value_type & error ) const { error = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( value_type volatile & error ,
-             value_type volatile const & input ) const
-    { if ( input ) error = 1 ; }
+  void join( value_type volatile & error, value_type volatile const & input ) const
+  { if ( input ) error = 1; }
 
   struct JoinMax {
-    typedef long int value_type ;
+    typedef long int value_type;
+
     KOKKOS_INLINE_FUNCTION
-    void join( value_type volatile & dst
-             , value_type volatile const & input ) const
-      { if ( dst < input ) dst = input ; }
+    void join( value_type volatile & dst, value_type volatile const & input ) const
+    { if ( dst < input ) dst = input; }
   };
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type ind , value_type & error ) const
+  void operator()( const typename policy_type::member_type ind, value_type & error ) const
   {
     if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
       const long int thread_count = ind.league_size() * ind.team_size();
-      total() = ( thread_count * ( thread_count + 1 ) ) / 2 ;
+      total() = ( thread_count * ( thread_count + 1 ) ) / 2;
     }
 
     // Team max:
-    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );
+    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ), JoinMax() );
 
     if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
-      printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
-            , ind.league_rank(), ind.team_rank()
-            , ind.league_size(), ind.team_size()
-            , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n",
+               ind.league_rank(), ind.team_rank(),
+               ind.league_size(), ind.team_size(),
+               (long int) ( ind.league_rank() + ( ind.team_size() - 1 ) ), m );
     }
 
     // Scan:
     const long int answer =
-      ( ind.league_rank() + 1 ) * ind.team_rank() +
-      ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;
+      ( ind.league_rank() + 1 ) * ind.team_rank() + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2;
 
     const long int result =
       ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
@@ -343,16 +328,17 @@ public:
       ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
 
     if ( answer != result || answer != result2 ) {
-      printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
-             ind.league_rank(), ind.team_rank(),
-             ind.league_size(), ind.team_size(),
-             answer,result,result2);
-      error = 1 ;
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
+              ind.league_rank(), ind.team_rank(),
+              ind.league_size(), ind.team_size(),
+              answer, result, result2 );
+
+      error = 1;
     }
 
     const long int thread_rank = ind.team_rank() +
                                  ind.team_size() * ind.league_rank();
-    ind.team_scan( 1 + thread_rank , accum.ptr_on_device() );
+    ind.team_scan( 1 + thread_rank, accum.ptr_on_device() );
   }
 };
 
@@ -360,47 +346,45 @@ template< class DeviceType, class ScheduleType >
 class TestScanTeam
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef long int    value_type ;
-
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space > policy_type ;
-  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType> functor_type ;
+  typedef DeviceType                                            execution_space;
+  typedef long int                                              value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType>       functor_type;
 
-  //------------------------------------
-
-  TestScanTeam( const size_t nteam )
-  {
-    run_test(nteam);
-  }
+  TestScanTeam( const size_t nteam ) { run_test( nteam ); }
 
   void run_test( const size_t nteam )
   {
-    typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
-    const unsigned REPEAT = 100000 ;
+    typedef Kokkos::View< long int, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
+
+    const unsigned REPEAT = 100000;
     unsigned Repeat;
-    if ( nteam == 0 )
-    {
+
+    if ( nteam == 0 ) {
       Repeat = 1;
-    } else {
-      Repeat = ( REPEAT + nteam - 1 ) / nteam ; //error here
     }
+    else {
+      Repeat = ( REPEAT + nteam - 1 ) / nteam; // Error here.
+    }
+
+    functor_type functor;
 
-    functor_type functor ;
+    policy_type team_exec( nteam, policy_type::team_size_max( functor ) );
 
-    policy_type team_exec( nteam , policy_type::team_size_max( functor ) );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      long int accum = 0;
+      long int total = 0;
+      long int error = 0;
+      Kokkos::deep_copy( functor.accum, total );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      long int accum = 0 ;
-      long int total = 0 ;
-      long int error = 0 ;
-      Kokkos::deep_copy( functor.accum , total );
-      Kokkos::parallel_reduce( team_exec , functor , result_type( & error ) );
+      Kokkos::parallel_reduce( team_exec, functor, result_type( & error ) );
       DeviceType::fence();
-      Kokkos::deep_copy( accum , functor.accum );
-      Kokkos::deep_copy( total , functor.total );
 
-      ASSERT_EQ( error , 0 );
-      ASSERT_EQ( total , accum );
+      Kokkos::deep_copy( accum, functor.accum );
+      Kokkos::deep_copy( total, functor.total );
+
+      ASSERT_EQ( error, 0 );
+      ASSERT_EQ( total, accum );
     }
 
     execution_space::fence();
@@ -416,18 +400,18 @@ namespace Test {
 template< class ExecSpace, class ScheduleType >
 struct SharedTeamFunctor {
 
-  typedef ExecSpace  execution_space ;
-  typedef int        value_type ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef ExecSpace                                             execution_space;
+  typedef int                                                   value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
 
   enum { SHARED_COUNT = 1000 };
 
-  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+  typedef typename ExecSpace::scratch_memory_space  shmem_space;
 
-  // tbd: MemoryUnmanaged should be the default for shared memory space
-  typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
-  // Tell how much shared memory will be required by this functor:
+  // Tell how much shared memory will be required by this functor.
   inline
   unsigned team_shmem_size( int team_size ) const
   {
@@ -436,19 +420,26 @@ struct SharedTeamFunctor {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
   {
-    const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
-    const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
-
-    if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
-        (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
-      printf ("Failed to allocate shared memory of size %lu\n",
-              static_cast<unsigned long> (SHARED_COUNT));
-      ++update; // failure to allocate is an error
+    const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+    const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
+
+    if ( ( shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0 ) ||
+         ( shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0 ) )
+    {
+      printf ("member( %d/%d , %d/%d ) Failed to allocate shared memory of size %lu\n"
+             , ind.league_rank()
+             , ind.league_size()
+             , ind.team_rank()
+             , ind.team_size()
+             , static_cast<unsigned long>( SHARED_COUNT )
+             );
+
+      ++update; // Failure to allocate is an error.
     }
     else {
-      for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
+      for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
         shared_A[i] = i + ind.league_rank();
         shared_B[i] = 2 * i + ind.league_rank();
       }
@@ -456,12 +447,13 @@ struct SharedTeamFunctor {
       ind.team_barrier();
 
       if ( ind.team_rank() + 1 == ind.team_size() ) {
-        for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+        for ( int i = 0; i < SHARED_COUNT; ++i ) {
           if ( shared_A[i] != i + ind.league_rank() ) {
-            ++update ;
+            ++update;
           }
+
           if ( shared_B[i] != 2 * i + ind.league_rank() ) {
-            ++update ;
+            ++update;
           }
         }
       }
@@ -469,78 +461,79 @@ struct SharedTeamFunctor {
   }
 };
 
-}
+} // namespace Test
 
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestSharedTeam {
-
-  TestSharedTeam()
-  { run(); }
+  TestSharedTeam() { run(); }
 
   void run()
   {
-    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
 
-    const size_t team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( Functor() );
+    const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
 
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size );
+    Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size, team_size );
 
-    typename Functor::value_type error_count = 0 ;
+    typename Functor::value_type error_count = 0;
 
-    Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) );
+    Kokkos::parallel_reduce( team_exec, Functor(), result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
-}
+
+} // namespace
 
 namespace Test {
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
 template< class MemorySpace, class ExecSpace, class ScheduleType >
 struct TestLambdaSharedTeam {
-
-  TestLambdaSharedTeam()
-  { run(); }
+  TestLambdaSharedTeam() { run(); }
 
   void run()
   {
-    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    //typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
-    typedef Kokkos::View< typename Functor::value_type , MemorySpace, Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::SharedTeamFunctor< ExecSpace, ScheduleType > Functor;
+    //typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
+    typedef Kokkos::View< typename Functor::value_type, MemorySpace, Kokkos::MemoryUnmanaged > result_type;
 
-    typedef typename ExecSpace::scratch_memory_space shmem_space ;
+    typedef typename ExecSpace::scratch_memory_space shmem_space;
 
-    // tbd: MemoryUnmanaged should be the default for shared memory space
-    typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+    // TBD: MemoryUnmanaged should be the default for shared memory space.
+    typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
     const int SHARED_COUNT = 1000;
     int team_size = 1;
+
 #ifdef KOKKOS_ENABLE_CUDA
-    if(std::is_same<ExecSpace,Kokkos::Cuda>::value)
-      team_size = 128;
+    if ( std::is_same< ExecSpace, Kokkos::Cuda >::value ) team_size = 128;
 #endif
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size);
-    team_exec = team_exec.set_scratch_size(0,Kokkos::PerTeam(SHARED_COUNT*2*sizeof(int)));
 
-    typename Functor::value_type error_count = 0 ;
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size, team_size );
+    team_exec = team_exec.set_scratch_size( 0, Kokkos::PerTeam( SHARED_COUNT * 2 * sizeof( int ) ) );
+
+    typename Functor::value_type error_count = 0;
 
-    Kokkos::parallel_reduce( team_exec , KOKKOS_LAMBDA
-        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind , int & update ) {
+    Kokkos::parallel_reduce( team_exec, KOKKOS_LAMBDA
+        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind, int & update )
+    {
+      const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+      const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
 
-      const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
-      const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
+      if ( ( shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0 ) ||
+           ( shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0 ) )
+      {
+        printf( "Failed to allocate shared memory of size %lu\n",
+                static_cast<unsigned long>( SHARED_COUNT ) );
 
-      if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
-          (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
-        printf ("Failed to allocate shared memory of size %lu\n",
-                static_cast<unsigned long> (SHARED_COUNT));
-        ++update; // failure to allocate is an error
-      } else {
-        for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
+        ++update; // Failure to allocate is an error.
+      }
+      else {
+        for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
           shared_A[i] = i + ind.league_rank();
           shared_B[i] = 2 * i + ind.league_rank();
         }
@@ -548,196 +541,213 @@ struct TestLambdaSharedTeam {
         ind.team_barrier();
 
         if ( ind.team_rank() + 1 == ind.team_size() ) {
-          for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+          for ( int i = 0; i < SHARED_COUNT; ++i ) {
             if ( shared_A[i] != i + ind.league_rank() ) {
-              ++update ;
+              ++update;
             }
+
             if ( shared_B[i] != 2 * i + ind.league_rank() ) {
-              ++update ;
+              ++update;
             }
           }
         }
       }
     }, result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
 #endif
-}
+
+} // namespace Test
 
 namespace Test {
 
 template< class ExecSpace, class ScheduleType >
 struct ScratchTeamFunctor {
 
-  typedef ExecSpace  execution_space ;
-  typedef int        value_type ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef ExecSpace                                            execution_space;
+  typedef int                                                  value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
 
   enum { SHARED_TEAM_COUNT = 100 };
   enum { SHARED_THREAD_COUNT = 10 };
 
-  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+  typedef typename ExecSpace::scratch_memory_space shmem_space;
 
-  // tbd: MemoryUnmanaged should be the default for shared memory space
-  typedef Kokkos::View<size_t*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< size_t*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
   {
-    const shared_int_array_type scratch_ptr( ind.team_scratch(1) , 3*ind.team_size() );
-    const shared_int_array_type scratch_A( ind.team_scratch(1) , SHARED_TEAM_COUNT );
-    const shared_int_array_type scratch_B( ind.thread_scratch(1) , SHARED_THREAD_COUNT );
-
-    if ((scratch_ptr.ptr_on_device () == NULL ) ||
-        (scratch_A.  ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0) ||
-        (scratch_B.  ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0)) {
-      printf ("Failed to allocate shared memory of size %lu\n",
-              static_cast<unsigned long> (SHARED_TEAM_COUNT));
-      ++update; // failure to allocate is an error
+    const shared_int_array_type scratch_ptr( ind.team_scratch( 1 ), 3 * ind.team_size() );
+    const shared_int_array_type scratch_A( ind.team_scratch( 1 ), SHARED_TEAM_COUNT );
+    const shared_int_array_type scratch_B( ind.thread_scratch( 1 ), SHARED_THREAD_COUNT );
+
+    if ( ( scratch_ptr.ptr_on_device () == NULL ) ||
+         ( scratch_A.  ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0 ) ||
+         ( scratch_B.  ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0 ) )
+    {
+      printf( "Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long>( SHARED_TEAM_COUNT ) );
+
+      ++update; // Failure to allocate is an error.
     }
     else {
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(ind,0,(int)SHARED_TEAM_COUNT),[&] (const int &i) {
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( ind, 0, (int) SHARED_TEAM_COUNT ), [&] ( const int & i ) {
         scratch_A[i] = i + ind.league_rank();
       });
-      for(int i=0; i<SHARED_THREAD_COUNT; i++)
-        scratch_B[i] = 10000*ind.league_rank() + 100*ind.team_rank() + i;
+
+      for ( int i = 0; i < SHARED_THREAD_COUNT; i++ ) {
+        scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
+      }
 
       scratch_ptr[ind.team_rank()] = (size_t) scratch_A.ptr_on_device();
       scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.ptr_on_device();
 
       ind.team_barrier();
 
-      for( int i = 0; i<SHARED_TEAM_COUNT; i++) {
-        if(scratch_A[i] != size_t(i + ind.league_rank()))
-          ++update;
+      for ( int i = 0; i < SHARED_TEAM_COUNT; i++ ) {
+        if ( scratch_A[i] != size_t( i + ind.league_rank() ) ) ++update;
       }
-      for( int i = 0; i < ind.team_size(); i++) {
-        if(scratch_ptr[0]!=scratch_ptr[i]) ++update;
+
+      for ( int i = 0; i < ind.team_size(); i++ ) {
+        if ( scratch_ptr[0] != scratch_ptr[i] ) ++update;
       }
-      if(scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
-         SHARED_THREAD_COUNT*sizeof(size_t))
+
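+      // Consecutive per-thread scratch pointers must be at least
+      // SHARED_THREAD_COUNT * sizeof(size_t) bytes apart and evenly spaced across the team.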
+      if ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] < SHARED_THREAD_COUNT * sizeof( size_t ) ) {
         ++update;
-      for( int i = 1; i < ind.team_size(); i++) {
-        if((scratch_ptr[i+ind.team_size()] - scratch_ptr[i-1+ind.team_size()]) !=
-           (scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()])) ++update;
+      }
 
+      for ( int i = 1; i < ind.team_size(); i++ ) {
+        if ( ( scratch_ptr[i + ind.team_size()] - scratch_ptr[i - 1 + ind.team_size()] ) !=
+             ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] ) )
+        {
+          ++update;
+        }
       }
     }
   }
 };
 
-}
+} // namespace Test
 
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestScratchTeam {
-
-  TestScratchTeam()
-  { run(); }
+  TestScratchTeam() { run(); }
 
   void run()
   {
-    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
 
     const size_t team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( Functor() );
 
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size );
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size, team_size );
+
+    typename Functor::value_type error_count = 0;
+
+    int team_scratch_size   = Functor::shared_int_array_type::shmem_size( Functor::SHARED_TEAM_COUNT ) +
+                              Functor::shared_int_array_type::shmem_size( 3 * team_size );
 
-    typename Functor::value_type error_count = 0 ;
+    int thread_scratch_size = Functor::shared_int_array_type::shmem_size( Functor::SHARED_THREAD_COUNT );
 
-    int team_scratch_size   = Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
-                              Functor::shared_int_array_type::shmem_size(3*team_size);
-    int thread_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_THREAD_COUNT);
-    Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size),
-                                                          Kokkos::PerThread(thread_scratch_size)) ,
-                             Functor() , result_type( & error_count ) );
+    Kokkos::parallel_reduce( team_exec.set_scratch_size( 0, Kokkos::PerTeam( team_scratch_size ),
+                                                         Kokkos::PerThread( thread_scratch_size ) ),
+                             Functor(), result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
-}
+
+} // namespace
 
 namespace Test {
-template< class ExecSpace>
+
+template< class ExecSpace >
 KOKKOS_INLINE_FUNCTION
-int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16);
-
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000);
-
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000);
+int test_team_mulit_level_scratch_loop_body( const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team ) {
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team1( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread1( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team2( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread2( team.thread_scratch( 0 ), 16 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team1( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread1( team.thread_scratch( 1 ), 16000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team2( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread2( team.thread_scratch( 1 ), 16000 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team3( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread3( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team3( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread3( team.thread_scratch( 1 ), 16000 );
 
   // The explicit types for 0 and 128 are here to test TeamThreadRange accepting different
   // types for begin and end.
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,int(0),unsigned(128)), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, int( 0 ), unsigned( 128 ) ), [&] ( const int & i )
   {
-    a_team1(i) = 1000000 + i;
-    a_team2(i) = 2000000 + i;
-    a_team3(i) = 3000000 + i;
+    a_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    a_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    a_team3( i ) = 3000000 + i + team.league_rank() * 100000;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
   {
-    a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
-    a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
-    a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+    a_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
   });
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
   {
-    b_team1(i) = 1000000 + i;
-    b_team2(i) = 2000000 + i;
-    b_team3(i) = 3000000 + i;
+    b_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    b_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    b_team3( i ) = 3000000 + i + team.league_rank() * 100000;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
   {
-    b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
-    b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
-    b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+    b_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
   });
 
   team.team_barrier();
+
   int error = 0;
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128 ), [&] ( const int & i )
   {
-    if(a_team1(i) != 1000000 + i) error++;
-    if(a_team2(i) != 2000000 + i) error++;
-    if(a_team3(i) != 3000000 + i) error++;
+    if ( a_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
   {
-    if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
-    if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
-    if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+    if ( a_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
   });
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
   {
-    if(b_team1(i) != 1000000 + i) error++;
-    if(b_team2(i) != 2000000 + i) error++;
-    if(b_team3(i) != 3000000 + i) error++;
+    if ( b_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
   {
-    if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
-    if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
-    if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+    if ( b_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
   });
 
   return error;
@@ -748,93 +758,107 @@ struct TagFor {};
 
 template< class ExecSpace, class ScheduleType >
 struct ClassNoShmemSizeFunction {
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
-    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
   }
 
   void run() {
-    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
     errors = d_errors;
 
-    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
 
-    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
     {
-    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this);
-    Kokkos::fence();
-    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
-    Kokkos::deep_copy(h_errors,d_errors);
-    ASSERT_EQ(h_errors(),0);
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
     }
 
     {
-    int error = 0;
-    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this,error);
-    Kokkos::fence();
-    ASSERT_EQ(error,0);
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
     }
   };
 };
 
 template< class ExecSpace, class ScheduleType >
 struct ClassWithShmemSizeFunction {
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
-    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
   }
 
   void run() {
-    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
     errors = d_errors;
 
-    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
+
     {
-    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this);
-    Kokkos::fence();
-    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors);
-    Kokkos::deep_copy(h_errors,d_errors);
-    ASSERT_EQ(h_errors(),0);
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                     Kokkos::PerThread( per_thread1 ) ),
+                            *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
     }
 
     {
-    int error = 0;
-    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this,error);
-    Kokkos::fence();
-    ASSERT_EQ(error,0);
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                        Kokkos::PerThread( per_thread1 ) ),
+                               *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
     }
   };
 
-  unsigned team_shmem_size(int team_size) const {
-    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
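+  // Level-0 scratch is requested through this hook; run() above only passes the
+  // level-1 request to the policy via set_scratch_size.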
+  unsigned team_shmem_size( int team_size ) const {
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
     return per_team0 + team_size * per_thread0;
   }
 };
@@ -842,67 +866,68 @@ struct ClassWithShmemSizeFunction {
 template< class ExecSpace, class ScheduleType >
 void test_team_mulit_level_scratch_test_lambda() {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
-  Kokkos::View<int,ExecSpace> d_errors("Errors");
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View< int, ExecSpace > d_errors( "Errors" );
   errors = d_errors;
 
-  const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-  const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
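+  // Scratch is requested at two levels below: small per-team and per-thread
+  // allocations at level 0, and much larger ones at level 1.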
+  const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+  const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+  const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+  const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
 
-  const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-  const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+  Kokkos::TeamPolicy< ExecSpace, ScheduleType > policy( 10, 8, 16 );
 
-  Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16);
-  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                        KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team )
+  {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   });
   Kokkos::fence();
-  typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(errors);
-  Kokkos::deep_copy(h_errors,d_errors);
-  ASSERT_EQ(h_errors(),0);
+
+  typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( errors );
+  Kokkos::deep_copy( h_errors, d_errors );
+  ASSERT_EQ( h_errors(), 0 );
 
   int error = 0;
-  Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) {
-      count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
-  },error);
-  ASSERT_EQ(error,0);
+  Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                           KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team, int & count )
+  {
+    count += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+  }, error );
+  ASSERT_EQ( error, 0 );
   Kokkos::fence();
 #endif
 }
 
-
-}
+} // namespace Test
 
 namespace {
+
 template< class ExecSpace, class ScheduleType >
 struct TestMultiLevelScratchTeam {
-
-  TestMultiLevelScratchTeam()
-  { run(); }
+  TestMultiLevelScratchTeam() { run(); }
 
   void run()
   {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
+    Test::test_team_mulit_level_scratch_test_lambda< ExecSpace, ScheduleType >();
 #endif
-    Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
+    Test::ClassNoShmemSizeFunction< ExecSpace, ScheduleType > c1;
     c1.run();
 
-    Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
+    Test::ClassWithShmemSizeFunction< ExecSpace, ScheduleType > c2;
     c2.run();
-
   }
 };
-}
+
+} // namespace
 
 namespace Test {
 
 template< class ExecSpace >
 struct TestShmemSize {
-
   TestShmemSize() { run(); }
 
   void run()
@@ -915,9 +940,8 @@ struct TestShmemSize {
 
     size_t size = view_type::shmem_size( d1, d2, d3 );
 
-    ASSERT_EQ( size, d1 * d2 * d3 * sizeof(long) );
+    ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
   }
 };
-}
 
-/*--------------------------------------------------------------------------*/
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp
index d9b06c29e49d0362226168861b0d5e818d1d82f9..8d16ac66db8abbf1b5afc3f12aaff7afe0159307 100644
--- a/lib/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,36 +50,47 @@
 namespace TestTeamVector {
 
 struct my_complex {
-  double re,im;
+  double re, im;
   int dummy;
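+
+  // The volatile copy, assignment, and compound-assignment overloads below are
+  // the ones exercised by the reduce tests, which combine partial results
+  // through volatile references.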
+
   KOKKOS_INLINE_FUNCTION
   my_complex() {
     re = 0.0;
     im = 0.0;
     dummy = 0;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex(const my_complex& src) {
+  my_complex( const my_complex & src ) {
     re = src.re;
     im = src.im;
     dummy = src.dummy;
   }
 
   KOKKOS_INLINE_FUNCTION
-  my_complex(const volatile my_complex& src) {
+  my_complex & operator=( const my_complex & src ) {
     re = src.re;
     im = src.im;
     dummy = src.dummy;
+    return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  my_complex(const double& val) {
+  my_complex( const volatile my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex( const double & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator += (const my_complex& src) {
+  my_complex & operator+=( const my_complex & src ) {
     re += src.re;
     im += src.im;
     dummy += src.dummy;
@@ -87,252 +98,278 @@ struct my_complex {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile my_complex& src) volatile {
+  void operator+=( const volatile my_complex & src ) volatile {
     re += src.re;
     im += src.im;
     dummy += src.dummy;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator *= (const my_complex& src) {
-    double re_tmp = re*src.re - im*src.im;
+  my_complex & operator*=( const my_complex & src ) {
+    double re_tmp = re * src.re - im * src.im;
     double im_tmp = re * src.im + im * src.re;
     re = re_tmp;
     im = im_tmp;
     dummy *= src.dummy;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile my_complex& src) volatile {
-    double re_tmp = re*src.re - im*src.im;
+  void operator*=( const volatile my_complex & src ) volatile {
+    double re_tmp = re * src.re - im * src.im;
     double im_tmp = re * src.im + im * src.re;
     re = re_tmp;
     im = im_tmp;
     dummy *= src.dummy;
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const my_complex& src) {
-    return (re == src.re) && (im == src.im) && ( dummy == src.dummy );
+  bool operator==( const my_complex & src ) {
+    return ( re == src.re ) && ( im == src.im ) && ( dummy == src.dummy );
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const my_complex& src) {
-      return (re != src.re) || (im != src.im) || ( dummy != src.dummy );
+  bool operator!=( const my_complex & src ) {
+    return ( re != src.re ) || ( im != src.im ) || ( dummy != src.dummy );
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const double& val) {
-    return (re != val) ||
-           (im != 0) || (dummy != 0);
+  bool operator!=( const double & val ) {
+    return ( re != val ) || ( im != 0 ) || ( dummy != 0 );
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator= (const int& val) {
+  my_complex & operator=( const int & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator= (const double& val) {
+  my_complex & operator=( const double & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
   operator double() {
     return re;
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
     typedef typename shared_int::size_type size_type;
 
-    const size_type shmemSize = team.team_size () * 13;
-    shared_int values = shared_int (team.team_shmem (), shmemSize);
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
 
-    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
-      printf ("FAILED to allocate shared memory of size %u\n",
-              static_cast<unsigned int> (shmemSize));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
     }
     else {
+      // Initialize shared memory.
+      values( team.team_rank() ) = 0;
 
-      // Initialize shared memory
-      values(team.team_rank ()) = 0;
-
-      // Accumulate value into per thread shared memory
-      // This is non blocking
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i)
+      // Accumulate a value into per-thread shared memory.
+      // This is non-blocking.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
       {
-        values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+        values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
       });
-      // Wait for all memory to be written
-      team.team_barrier ();
-      // One thread per team executes the comparison
-      Kokkos::single(Kokkos::PerTeam(team),[&]()
+
+      // Wait for all memory to be written.
+      team.team_barrier();
+
+      // One thread per team executes the comparison.
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
       {
-            Scalar test = 0;
-            Scalar value = 0;
-            for (int i = 0; i < 131; ++i) {
-              test += i - team.league_rank () + team.league_size () + team.team_size ();
-            }
-            for (int i = 0; i < team.team_size (); ++i) {
-              value += values(i);
-            }
-            if (test != value) {
-              printf ("FAILED team_parallel_for %i %i %f %f\n",
-                      team.league_rank (), team.team_rank (),
-                      static_cast<double> (test), static_cast<double> (value));
-              flag() = 1;
-            }
+        Scalar test = 0;
+        Scalar value = 0;
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
+        }
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
+        }
+
+        if ( test != value ) {
+          printf ( "FAILED team_parallel_for %i %i %f %f\n",
+                   team.league_rank(), team.team_rank(),
+                   static_cast<double>( test ), static_cast<double>( value ) );
+          flag() = 1;
+        }
       });
     }
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_reduce {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = Scalar();
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val)
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
     {
-      val += i - team.league_rank () + team.league_size () + team.team_size ();
-    },value);
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
-        {
-         Scalar test = 0;
-         for (int i = 0; i < 131; ++i) {
-           test += i - team.league_rank () + team.league_size () + team.team_size ();
-         }
-         if (test != value) {
-           if(team.league_rank() == 0)
-           printf ("FAILED team_parallel_reduce %i %i %f %f %lu\n",
-             team.league_rank (), team.team_rank (),
-             static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
-              flag() = 1;
-         }
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) {
+          printf( "FAILED team_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), sizeof( Scalar ) );
+        }
+
+        flag() = 1;
+      }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_reduce_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_reduce_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
 
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
-      , [&] (int i, Scalar& val)
-      {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-      }
-      , [&] (volatile Scalar& val, const volatile Scalar& src)
-        {val+=src;}
-      , value
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      [] ( volatile Scalar & val, const volatile Scalar & src ) { val += src; },
+      value
     );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
-         Scalar test = 0;
-         for (int i = 0; i < 131; ++i) {
-           test += i - team.league_rank () + team.league_size () + team.team_size ();
-         }
-         if (test != value) {
-           printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
-             team.league_rank (), team.team_rank (),
-             static_cast<double> (test), static_cast<double> (value));
-              flag() = 1;
-         }
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
+        flag() = 1;
+      }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_vector_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
     typedef typename shared_int::size_type size_type;
 
-    const size_type shmemSize = team.team_size () * 13;
-    shared_int values = shared_int (team.team_shmem (), shmemSize);
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
 
-    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
-      printf ("FAILED to allocate shared memory of size %u\n",
-              static_cast<unsigned int> (shmemSize));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
     }
     else {
-      Kokkos::single(Kokkos::PerThread(team),[&] ()
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
       {
-        values(team.team_rank ()) = 0;
+        values( team.team_rank() ) = 0;
       });
 
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i)
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
       {
-        Kokkos::single(Kokkos::PerThread(team),[&] ()
+        Kokkos::single( Kokkos::PerThread( team ), [&] ()
         {
-          values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+          values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
         });
       });
 
-      team.team_barrier ();
-      Kokkos::single(Kokkos::PerTeam(team),[&]()
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
       {
         Scalar test = 0;
         Scalar value = 0;
-        for (int i = 0; i < 131; ++i) {
-          test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
         }
-        for (int i = 0; i < team.team_size (); ++i) {
-          value += values(i);
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
         }
-        if (test != value) {
-          printf ("FAILED team_vector_parallel_for %i %i %f %f\n",
-                  team.league_rank (), team.team_rank (),
-                  static_cast<double> (test), static_cast<double> (value));
+
+        if ( test != value ) {
+          printf( "FAILED team_vector_parallel_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
           flag() = 1;
         }
       });
@@ -340,164 +377,176 @@ struct functor_team_vector_for {
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_reduce {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_team_vector_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = Scalar();
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val)
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
     {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-    },value);
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
       Scalar test = 0;
-      for (int i = 0; i < 131; ++i) {
-        test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
       }
-      if (test != value) {
-        if(team.league_rank() == 0)
-        printf ("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
-          team.league_rank (), team.team_rank (),
-          static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
-           flag() = 1;
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) {
+          printf( "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), sizeof( Scalar ) );
+        }
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_reduce_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_vector_reduce_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
-      , [&] (int i, Scalar& val)
-      {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-      }
-      , [&] (volatile Scalar& val, const volatile Scalar& src)
-        {val+=src;}
-      , value
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      [] ( volatile Scalar & val, const volatile Scalar & src ) { val += src; },
+      value
     );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
       Scalar test = 0;
-      for (int i = 0; i < 131; ++i) {
-         test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+      for ( int i = 0; i < 131; ++i ) {
+         test += i - team.league_rank() + team.league_size() + team.team_size();
       }
-      if (test != value) {
-        printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
-          team.league_rank (), team.team_rank (),
-          static_cast<double> (test), static_cast<double> (value));
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
         flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_single {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_single(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-
-    // Warning: this test case intentionally violates permissable semantics
+  void operator()( typename policy_type::member_type team ) const {
+    // Warning: this test case intentionally violates permissible semantics.
     // It is not valid to get references to members of the enclosing region
     // inside a parallel_for and write to them.
     Scalar value = 0;
 
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13),[&] (int i)
+    Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
     {
-      value = i; // This write is violating Kokkos semantics for nested parallelism
+      value = i; // This write is violating Kokkos semantics for nested parallelism.
     });
 
-    Kokkos::single(Kokkos::PerThread(team),[&] (Scalar& val)
+    Kokkos::single( Kokkos::PerThread( team ), [&] ( Scalar & val )
     {
       val = 1;
-    },value);
+    }, value );
 
     Scalar value2 = 0;
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13), [&] (int i, Scalar& val)
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
     {
       val += value;
-    },value2);
+    }, value2 );
+
+    if ( value2 != ( value * 13 ) ) {
+      printf( "FAILED vector_single broadcast %i %i %f %f\n",
+              team.league_rank(), team.team_rank(), (double) value2, (double) value );
 
-    if(value2!=(value*13)) {
-      printf("FAILED vector_single broadcast %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) value2,(double) value);
-      flag()=1;
+      flag() = 1;
     }
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
-    shared_int values = shared_int(team.team_shmem(),team.team_size()*13);
+    shared_int values = shared_int( team.team_shmem(), team.team_size() * 13 );
 
-    if (values.ptr_on_device () == NULL ||
-        values.dimension_0() < (unsigned) team.team_size() * 13) {
-      printf ("FAILED to allocate memory of size %i\n",
-              static_cast<int> (team.team_size () * 13));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < (unsigned) team.team_size() * 13 ) {
+      printf( "FAILED to allocate memory of size %i\n", static_cast<int>( team.team_size() * 13 ) );
       flag() = 1;
     }
     else {
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13), [&] (int i)
+      Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
       {
-        values(13*team.team_rank() + i) = i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+        values( 13 * team.team_rank() + i ) =
+          i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
       });
 
-      Kokkos::single(Kokkos::PerThread(team),[&] ()
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
       {
         Scalar test = 0;
         Scalar value = 0;
-        for (int i = 0; i < 13; ++i) {
+
+        for ( int i = 0; i < 13; ++i ) {
           test += i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
-          value += values(13*team.team_rank() + i);
+          value += values( 13 * team.team_rank() + i );
         }
-        if (test != value) {
-          printf ("FAILED vector_par_for %i %i %f %f\n",
-                  team.league_rank (), team.team_rank (),
-                  static_cast<double> (test), static_cast<double> (value));
+
+        if ( test != value ) {
+          printf( "FAILED vector_par_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
           flag() = 1;
         }
       });
@@ -505,169 +554,192 @@ struct functor_vec_for {
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_red {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_red(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
 
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val)
+    // When no reducer is given, the default is summation.
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
     {
       val += i;
-    }, value);
+    }, value );
 
-    Kokkos::single(Kokkos::PerThread(team),[&] ()
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
     {
       Scalar test = 0;
-      for(int i = 0; i < 13; i++) {
-        test+=i;
-      }
-      if(test!=value) {
-        printf("FAILED vector_par_reduce %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
-        flag()=1;
+
+      for ( int i = 0; i < 13; i++ ) test += i;
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_red_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_red_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
+    // Must initialize to the identity value for the reduce operation
+    // for this test:
+    //   ( identity, operation ) = ( 1, *= )
     Scalar value = 1;
 
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13)
-      , [&] (int i, Scalar& val)
-      { val *= i; }
-      , [&] (Scalar& val, const Scalar& src)
-      {val*=src;}
-      , value
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
+    {
+      val *= ( i % 5 + 1 );
+    },
+      [&] ( Scalar & val, const Scalar & src ) { val *= src; },
+      value
     );
 
-    Kokkos::single(Kokkos::PerThread(team),[&] ()
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
     {
       Scalar test = 1;
-      for(int i = 0; i < 13; i++) {
-        test*=i;
-      }
-      if(test!=value) {
-        printf("FAILED vector_par_reduce_join %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
-        flag()=1;
+
+      for ( int i = 0; i < 13; i++ ) test *= ( i % 5 + 1 );
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_scan {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_scan(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_scan( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val, bool final)
+  void operator()( typename policy_type::member_type team ) const {
+    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val, bool final )
     {
       val += i;
-      if(final) {
+
+      if ( final ) {
         Scalar test = 0;
-        for(int k = 0; k <= i; k++) {
-          test+=k;
-        }
-        if(test!=val) {
-          printf("FAILED vector_par_scan %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) val);
-          flag()=1;
+        for ( int k = 0; k <= i; k++ ) test += k;
+
+        if ( test != val ) {
+          printf( "FAILED vector_par_scan %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(), (double) test, (double) val );
+
+          flag() = 1;
         }
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_reduce {
   typedef double value_type;
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team, double& sum) const {
+  void operator()( typename policy_type::member_type team, double & sum ) const {
     sum += team.league_rank() * 100 + team.thread_rank();
   }
 };
 
-template<typename Scalar,class ExecutionSpace>
-bool test_scalar(int nteams, int team_size, int test) {
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> d_flag("flag");
-  typename Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace>::HostMirror h_flag("h_flag");
-  h_flag() = 0 ;
-  Kokkos::deep_copy(d_flag,h_flag);
-  
-  if(test==0)
-  Kokkos::parallel_for( std::string("A") , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_red<Scalar, ExecutionSpace>(d_flag));
-  if(test==1)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_red_join<Scalar, ExecutionSpace>(d_flag));
-  if(test==2)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_scan<Scalar, ExecutionSpace>(d_flag));
-  if(test==3)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==4)
-  Kokkos::parallel_for( "B" , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_single<Scalar, ExecutionSpace>(d_flag));
-  if(test==5)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==6)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_reduce<Scalar, ExecutionSpace>(d_flag));
-  if(test==7)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_reduce_join<Scalar, ExecutionSpace>(d_flag));
-  if(test==8)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==9)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag));
-  if(test==10)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_reduce_join<Scalar, ExecutionSpace>(d_flag));
-  
-  Kokkos::deep_copy(h_flag,d_flag);
-
-  return (h_flag() == 0);
+template< typename Scalar, class ExecutionSpace >
+bool test_scalar( int nteams, int team_size, int test ) {
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > d_flag( "flag" );
+  typename Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace >::HostMirror h_flag( "h_flag" );
+  h_flag() = 0;
+  Kokkos::deep_copy( d_flag, h_flag );
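+
+  // Dispatch the nested-parallelism pattern selected by 'test'; a vector length
+  // of 8 is requested for the vector-level variants.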
+
+  if ( test == 0 ) {
+    Kokkos::parallel_for( std::string( "A" ), Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 1 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 2 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_scan< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 3 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 4 ) {
+    Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_single< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 5 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 6 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 7 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 8 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 9 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 10 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+
+  Kokkos::deep_copy( h_flag, d_flag );
+
+  return ( h_flag() == 0 );
 }
 
-template<class ExecutionSpace>
-bool Test(int test) {
+template< class ExecutionSpace >
+bool Test( int test ) {
   bool passed = true;
-  passed = passed && test_scalar<int, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<long long int, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<float, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<double, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<my_complex, ExecutionSpace>(317,33,test);
-  return passed;
-}
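+  // Exercise the selected pattern for several scalar types, including the
+  // custom my_complex reduction type defined above.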
+  passed = passed && test_scalar< int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< long long int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< float, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< double, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< my_complex, ExecutionSpace >( 317, 33, test );
 
+  return passed;
 }
 
+} // namespace TestTeamVector
diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
index 203c9526792f8a5bbef9dbcb0582ce2d8d3a80e2..7bcf3f8a32691ee8a27bac5ed997ed68c6c39082 100644
--- a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -47,152 +47,162 @@
 
 namespace {
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumPlain {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+
   type view;
-  SumPlain(type view_):view(view_) {}
+
+  SumPlain( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, Scalar& val) {
+  void operator()( int i, Scalar & val ) {
     val += Scalar();
   }
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueType {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumInitJoinFinalValueType(type view_):view(view_) {}
+
+  type view;
+
+  SumInitJoinFinalValueType( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(value_type& val) const {
+  void init( value_type & val ) const {
     val = value_type();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, volatile value_type& src) const {
+  void join( volatile value_type & val, volatile value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueType2 {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumInitJoinFinalValueType2(type view_):view(view_) {}
+
+  type view;
+
+  SumInitJoinFinalValueType2( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(volatile value_type& val) const {
+  void init( volatile value_type & val ) const {
     val = value_type();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const volatile value_type& src) const {
+  void join( volatile value_type & val, const volatile value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueTypeArray {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type[];
+
+  type view;
   int n;
-  SumInitJoinFinalValueTypeArray(type view_, int n_):view(view_),n(n_) {}
+
+  SumInitJoinFinalValueTypeArray( type view_, int n_ ) : view( view_ ), n( n_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(value_type val) const {
-    for(int k=0;k<n;k++)
+  void init( value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
       val[k] = 0;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type val, const volatile value_type src) const {
-    for(int k=0;k<n;k++)
+  void join( volatile value_type val, const volatile value_type src ) const {
+    for ( int k = 0; k < n; k++ ) {
       val[k] += src[k];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type val) const {
-    for(int k=0;k<n;k++)
-      val[k] += k*i;
+  void operator()( int i, value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
+      val[k] += k * i;
+    }
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumWrongInitJoinFinalValueType {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumWrongInitJoinFinalValueType(type view_):view(view_) {}
+
+  type view;
+
+  SumWrongInitJoinFinalValueType( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& val) const {
+  void init( double & val ) const {
     val = double();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const value_type& src) const {
+  void join( volatile value_type & val, const value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 void TestTemplateMetaFunctions() {
-  typedef typename Kokkos::View<Scalar*,ExecutionSpace> type;
-  type a("A",100);
+  typedef typename Kokkos::View< Scalar*, ExecutionSpace > type;
+  type a( "A", 100 );
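+
+  // The FunctorHasInit / FunctorHasJoin trait checks below are currently
+  // disabled (commented out).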
 /*
-  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
-  ASSERT_EQ(sum_plain_has_init_arg,0);
-  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
-  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg2,1);
-  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,0);
-
-  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueTypeArray<Scalar,ExecutionSpace>, Scalar[] >::value;
-  //ASSERT_EQ(sum_initjoinfinalvaluetypearray_has_init_arg,1);
-
-  //printf("Values Init: %i %i %i\n",sum_plain_has_init_arg,sum_initjoinfinalvaluetype_has_init_arg,sum_wronginitjoinfinalvaluetype_has_init_arg);
-
-  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_plain_has_join_arg,0);
-  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
-  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
-  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,0);
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar, ExecutionSpace>, Scalar & >::value;
+  ASSERT_EQ( sum_plain_has_init_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
+
+  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>, Scalar[] >::value;
+  //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
+
+  //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg, sum_initjoinfinalvaluetype_has_init_arg, sum_wronginitjoinfinalvaluetype_has_init_arg );
+
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_plain_has_join_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
+
+  //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg, sum_initjoinfinalvaluetype_has_join_arg, sum_wronginitjoinfinalvaluetype_has_join_arg );
 */
-  //printf("Values Join: %i %i %i\n",sum_plain_has_join_arg,sum_initjoinfinalvaluetype_has_join_arg,sum_wronginitjoinfinalvaluetype_has_join_arg);
 }
 
-}
+} // namespace
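The commented-out assertions above probe whether a reduction functor exposes the optional init()/join() hooks that Kokkos detects through FunctorHasInit/FunctorHasJoin. For illustration, a minimal sketch of such a functor and its use with parallel_reduce; the functor name SumWithInitJoin and the range of 100 are arbitrary assumptions, not part of this test:

#include <Kokkos_Core.hpp>

// Sketch only: a reduction functor providing the optional init()/join() hooks.
struct SumWithInitJoin {
  typedef double value_type;  // reduction result type

  KOKKOS_INLINE_FUNCTION
  void init( value_type & val ) const { val = 0.0; }  // identity element

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & dst, const volatile value_type & src ) const { dst += src; }

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i, value_type & val ) const { val += double( i ); }
};

// Usage sketch (after Kokkos::initialize()): sums 0 + 1 + ... + 99 into result.
// double result = 0.0;
// Kokkos::parallel_reduce( 100, SumWithInitJoin(), result );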
diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp
index 842131debb69b54ad08fd0eb90836510be50d7ca..7d096c24c38ee82a6930ed192858e538e345dc29 100644
--- a/lib/kokkos/core/unit_test/TestTile.hpp
+++ b/lib/kokkos/core/unit_test/TestTile.hpp
@@ -1,12 +1,12 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -35,7 +35,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 
@@ -47,108 +47,96 @@
 
 namespace TestTile {
 
-template < typename Device , typename TileLayout>
+template < typename Device, typename TileLayout >
 struct ReduceTileErrors
 {
-  typedef Device execution_space ;
-
-  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device>  array_type;
-  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft , Device >  tile_type ;
-
-  array_type m_array ;
-
+  typedef Device execution_space;
+  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device >  array_type;
+  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft, Device >  tile_type;
   typedef ptrdiff_t value_type;
 
-  ReduceTileErrors( array_type a )
-    : m_array(a)
-  {}
+  array_type m_array;
 
+  ReduceTileErrors( array_type a ) : m_array( a ) {}
 
   KOKKOS_INLINE_FUNCTION
-  static void init( value_type & errors )
-  {
-    errors = 0;
-  }
+  static void init( value_type & errors ) { errors = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & errors ,
+  static void join( volatile value_type & errors,
                     const volatile value_type & src_errors )
   {
     errors += src_errors;
   }
 
-  // Initialize
+  // Initialize.
   KOKKOS_INLINE_FUNCTION
   void operator()( size_t iwork ) const
   {
     const size_t i = iwork % m_array.dimension_0();
     const size_t j = iwork / m_array.dimension_0();
-    if ( j < m_array.dimension_1() ) {
-      m_array(i,j) = & m_array(i,j) - & m_array(0,0);
 
-// printf("m_array(%d,%d) = %d\n",int(i),int(j),int(m_array(i,j)));
+    if ( j < m_array.dimension_1() ) {
+      m_array( i, j ) = &m_array( i, j ) - &m_array( 0, 0 );
 
+      //printf( "m_array(%d, %d) = %d\n", int( i ), int( j ), int( m_array( i, j ) ) );
     }
   }
 
   // Verify:
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t iwork , value_type & errors ) const
+  void operator()( size_t iwork, value_type & errors ) const
   {
-    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0 ;
-    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1 ;
+    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0;
+    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1;
 
-    const size_t itile = iwork % tile_dim0 ;
-    const size_t jtile = iwork / tile_dim0 ;
+    const size_t itile = iwork % tile_dim0;
+    const size_t jtile = iwork / tile_dim0;
 
     if ( jtile < tile_dim1 ) {
+      tile_type tile = Kokkos::Experimental::tile_subview( m_array, itile, jtile );
 
-      tile_type tile = Kokkos::Experimental::tile_subview( m_array , itile , jtile );
-
-      if ( tile(0,0) != ptrdiff_t(( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
-        ++errors ;
+      if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
+        ++errors;
       }
       else {
+        for ( size_t j = 0; j < size_t( TileLayout::N1 ); ++j ) {
+          for ( size_t i = 0; i < size_t( TileLayout::N0 ); ++i ) {
+            const size_t iglobal = i + itile * TileLayout::N0;
+            const size_t jglobal = j + jtile * TileLayout::N1;
 
-        for ( size_t j = 0 ; j < size_t(TileLayout::N1) ; ++j ) {
-        for ( size_t i = 0 ; i < size_t(TileLayout::N0) ; ++i ) {
-          const size_t iglobal = i + itile * TileLayout::N0 ;
-          const size_t jglobal = j + jtile * TileLayout::N1 ;
-
-          if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
-            if ( tile(i,j) != ptrdiff_t( tile(0,0) + i + j * TileLayout::N0 ) ) ++errors ;
-
-// printf("tile(%d,%d)(%d,%d) = %d\n",int(itile),int(jtile),int(i),int(j),int(tile(i,j)));
+            if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
+              if ( tile( i, j ) != ptrdiff_t( tile( 0, 0 ) + i + j * TileLayout::N0 ) ) ++errors;
 
+              //printf( "tile(%d, %d)(%d, %d) = %d\n", int( itile ), int( jtile ), int( i ), int( j ), int( tile( i, j ) ) );
+            }
           }
         }
-        }
       }
     }
   }
 };
 
-template< class Space , unsigned N0 , unsigned N1 >
-void test( const size_t dim0 , const size_t dim1 )
+template< class Space, unsigned N0, unsigned N1 >
+void test( const size_t dim0, const size_t dim1 )
 {
-  typedef Kokkos::LayoutTileLeft<N0,N1>  array_layout ;
-  typedef ReduceTileErrors< Space , array_layout > functor_type ;
+  typedef Kokkos::LayoutTileLeft< N0, N1 >  array_layout;
+  typedef ReduceTileErrors< Space, array_layout > functor_type;
 
-  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0 ;
-  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1 ;
-  
-  typename functor_type::array_type array("",dim0,dim1);
+  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0;
+  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1;
 
-  Kokkos::parallel_for( Kokkos::RangePolicy<Space,size_t>(0,dim0*dim1) , functor_type( array ) );
+  typename functor_type::array_type array( "", dim0, dim1 );
 
-  ptrdiff_t error = 0 ;
+  Kokkos::parallel_for( Kokkos::RangePolicy< Space, size_t >( 0, dim0 * dim1 ), functor_type( array ) );
 
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space,size_t>(0,tile_dim0*tile_dim1) , functor_type( array ) , error );
+  ptrdiff_t error = 0;
 
-  EXPECT_EQ( error , ptrdiff_t(0) );
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< Space, size_t >( 0, tile_dim0 * tile_dim1 ), functor_type( array ), error );
+
+  EXPECT_EQ( error, ptrdiff_t( 0 ) );
 }
 
-} /* namespace TestTile */
+} // namespace TestTile
 
 #endif //TEST_TILE_HPP
-
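ReduceTileErrors above exercises Kokkos::LayoutTileLeft together with Kokkos::Experimental::tile_subview: each tile of the 2D array is extracted as a contiguous LayoutLeft view and its element offsets are verified. A minimal sketch of that pattern, assuming a host-accessible memory space and arbitrary 4x4 tiles over a 10x10 array:

#include <Kokkos_Core.hpp>

// Sketch only: extract one tile of a tiled-left 2D view as a contiguous subview.
// (Assumes Kokkos::initialize() has been called.)
typedef Kokkos::LayoutTileLeft< 4, 4 > tiled_layout;
typedef Kokkos::View< ptrdiff_t**, tiled_layout, Kokkos::HostSpace > tiled_array;

void tile_sketch()
{
  tiled_array a( "a", 10, 10 );                                // 10x10 elements -> 3x3 tiles
  auto tile = Kokkos::Experimental::tile_subview( a, 1, 2 );   // tile at tile-coordinates (1, 2)
  const ptrdiff_t first = tile( 0, 0 );                        // aliases a's element (4, 8)
  (void) first;
}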
diff --git a/lib/kokkos/core/unit_test/TestUtilities.hpp b/lib/kokkos/core/unit_test/TestUtilities.hpp
index 947be03e399bee3c23f4c4f333c34c0e6a9d4d08..be4a93b8942cdfd69e97f68b9ea109a2be10de19 100644
--- a/lib/kokkos/core/unit_test/TestUtilities.hpp
+++ b/lib/kokkos/core/unit_test/TestUtilities.hpp
@@ -49,258 +49,253 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 inline
 void test_utilities()
 {
   using namespace Kokkos::Impl;
+
   {
-    using i = integer_sequence<int>;
-    using j = make_integer_sequence<int,0>;
+    using i = integer_sequence< int >;
+    using j = make_integer_sequence< int, 0 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 0u, "Error: integer_sequence.size()" );
   }
 
-
   {
-    using i = integer_sequence<int,0>;
-    using j = make_integer_sequence<int,1>;
+    using i = integer_sequence< int, 0 >;
+    using j = make_integer_sequence< int, 1 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 1u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
   }
 
-
   {
-    using i = integer_sequence<int,0,1>;
-    using j = make_integer_sequence<int,2>;
+    using i = integer_sequence< int, 0, 1 >;
+    using j = make_integer_sequence< int, 2 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 2u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2>;
-    using j = make_integer_sequence<int,3>;
+    using i = integer_sequence< int, 0, 1, 2 >;
+    using j = make_integer_sequence< int, 3 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 3u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3>;
-    using j = make_integer_sequence<int,4>;
+    using i = integer_sequence< int, 0, 1, 2, 3 >;
+    using j = make_integer_sequence< int, 4 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 4u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4>;
-    using j = make_integer_sequence<int,5>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4 >;
+    using j = make_integer_sequence< int, 5 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 5u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5>;
-    using j = make_integer_sequence<int,6>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5 >;
+    using j = make_integer_sequence< int, 6 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 6u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6>;
-    using j = make_integer_sequence<int,7>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6 >;
+    using j = make_integer_sequence< int, 7 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 7u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7>;
-    using j = make_integer_sequence<int,8>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7 >;
+    using j = make_integer_sequence< int, 8 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 8u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7,8>;
-    using j = make_integer_sequence<int,9>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8 >;
+    using j = make_integer_sequence< int, 9 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 9u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<8, i>::value == 8, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7,8,9>;
-    using j = make_integer_sequence<int,10>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 >;
+    using j = make_integer_sequence< int, 10 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 10u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<8, i>::value == 8, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<9, i>::value == 9, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(9, i{}) == 9, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 9, i >::value == 9, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 9, i{} ) == 9, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = make_integer_sequence<int, 5>;
-    using r = reverse_integer_sequence<i>;
-    using gr = integer_sequence<int, 4, 3, 2, 1, 0>;
+    using i = make_integer_sequence< int, 5 >;
+    using r = reverse_integer_sequence< i >;
+    using gr = integer_sequence< int, 4, 3, 2, 1, 0 >;
 
-    static_assert( std::is_same<r,gr>::value, "Error: reverse_integer_sequence" );
+    static_assert( std::is_same< r, gr >::value, "Error: reverse_integer_sequence" );
   }
 
   {
-    using s = make_integer_sequence<int,10>;
-    using e = exclusive_scan_integer_sequence<s>;
-    using i = inclusive_scan_integer_sequence<s>;
+    using s = make_integer_sequence< int, 10 >;
+    using e = exclusive_scan_integer_sequence< s >;
+    using i = inclusive_scan_integer_sequence< s >;
 
-    using ge = integer_sequence<int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36>;
-    using gi = integer_sequence<int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45>;
+    using ge = integer_sequence< int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36 >;
+    using gi = integer_sequence< int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45 >;
 
-    static_assert( e::value == 45, "Error: scan value");
-    static_assert( i::value == 45, "Error: scan value");
+    static_assert( e::value == 45, "Error: scan value" );
+    static_assert( i::value == 45, "Error: scan value" );
 
-    static_assert( std::is_same< e::type, ge >::value, "Error: exclusive_scan");
-    static_assert( std::is_same< i::type, gi >::value, "Error: inclusive_scan");
+    static_assert( std::is_same< e::type, ge >::value, "Error: exclusive_scan" );
+    static_assert( std::is_same< i::type, gi >::value, "Error: inclusive_scan" );
   }
-
-
 }
 
 } // namespace Test
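The traits exercised above mirror the C++14 std::integer_sequence family, with the Kokkos::Impl variants adding indexed access (integer_sequence_at / at) and compile-time scans. For comparison, the same identity the first block asserts, written against the standard-library equivalents; the alias names are arbitrary:

#include <type_traits>
#include <utility>

// Sketch only: the standard-library counterpart of the checks above.
using seq      = std::make_integer_sequence< int, 3 >;  // 0, 1, 2
using expected = std::integer_sequence< int, 0, 1, 2 >;

static_assert( std::is_same< seq, expected >::value, "make_integer_sequence mismatch" );
static_assert( seq::size() == 3u, "integer_sequence size mismatch" );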
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
index a96f31cc12f227a66097c595e1f0fb44dd17a8c4..cbf86dc58c78fb44442d08497874a667f3923efb 100644
--- a/lib/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,103 +48,92 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
-
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< class T , class ... P >
-size_t allocation_count( const Kokkos::View<T,P...> & view )
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
 {
   const size_t card  = view.size();
   const size_t alloc = view.span();
 
-  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
 
-  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
 }
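allocation_count distinguishes View::size() (the number of mapped elements) from View::span() (the allocated extent, which may include layout padding), and checks required_allocation_size() for 100 ints against the expected 400 bytes. A minimal host-side sketch of those three quantities; the extent of 100 matches the test, while the function name is illustrative:

#include <Kokkos_Core.hpp>

// Sketch only: size() vs. span() vs. required_allocation_size().
// (Assumes Kokkos::initialize() has been called.)
void span_vs_size_sketch()
{
  Kokkos::View< int*, Kokkos::HostSpace > v( "v", 100 );
  const size_t n     = v.size();   // 100 mapped elements
  const size_t alloc = v.span();   // >= 100 once any layout padding is counted
  const size_t bytes = Kokkos::View< int* >::required_allocation_size( 100 );  // 400 for 4-byte int
  (void) n; (void) alloc; (void) bytes;
}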
 
 /*--------------------------------------------------------------------------*/
 
-template< typename T, class DeviceType>
+template< typename T, class DeviceType >
 struct TestViewOperator
 {
-  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::execution_space  execution_space;
 
-  static const unsigned N = 100 ;
-  static const unsigned D = 3 ;
+  static const unsigned N = 100;
+  static const unsigned D = 3;
 
-  typedef Kokkos::View< T*[D] , execution_space > view_type ;
+  typedef Kokkos::View< T*[D], execution_space > view_type;
 
-  const view_type v1 ;
-  const view_type v2 ;
+  const view_type v1;
+  const view_type v2;
 
   TestViewOperator()
-    : v1( "v1" , N )
-    , v2( "v2" , N )
+    : v1( "v1", N )
+    , v2( "v2", N )
     {}
 
   static void testit()
   {
-    Kokkos::parallel_for( N , TestViewOperator() );
+    Kokkos::parallel_for( N, TestViewOperator() );
   }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const unsigned i ) const
   {
-    const unsigned X = 0 ;
-    const unsigned Y = 1 ;
-    const unsigned Z = 2 ;
+    const unsigned X = 0;
+    const unsigned Y = 1;
+    const unsigned Z = 2;
 
-    v2(i,X) = v1(i,X);
-    v2(i,Y) = v1(i,Y);
-    v2(i,Z) = v1(i,Z);
+    v2( i, X ) = v1( i, X );
+    v2( i, Y ) = v1( i, Y );
+    v2( i, Z ) = v1( i, Z );
   }
 };
 
 /*--------------------------------------------------------------------------*/
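The TestViewOperator_LeftAndRight specializations below walk every index combination and confirm the address ordering implied by the layout: LayoutLeft strides fastest in the leftmost index, LayoutRight in the rightmost, and, where a stride_view is present, a LayoutStride view built from the same mapping aliases the same addresses. A reduced 2D sketch of the ordering property, with arbitrary 3x4 extents on the host:

#include <Kokkos_Core.hpp>

// Sketch only: the fast index of each layout is unit-stride in memory.
// (Assumes Kokkos::initialize() has been called.)
void layout_order_sketch()
{
  Kokkos::View< int**, Kokkos::LayoutLeft,  Kokkos::HostSpace > l( "l", 3, 4 );
  Kokkos::View< int**, Kokkos::LayoutRight, Kokkos::HostSpace > r( "r", 3, 4 );

  const long dl = &l( 1, 0 ) - &l( 0, 0 );  // LayoutLeft: expected 1
  const long dr = &r( 0, 1 ) - &r( 0, 0 );  // LayoutRight: expected 1
  (void) dl; (void) dr;
}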
 
-template< class DataType ,
-          class DeviceType ,
+template< class DataType,
+          class DeviceType,
           unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
-struct TestViewOperator_LeftAndRight ;
+struct TestViewOperator_LeftAndRight;
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 8 >
 {
-  typedef typename DeviceType::execution_space    execution_space ;
-  typedef typename DeviceType::memory_space       memory_space ;
-  typedef typename execution_space::size_type     size_type ;
+  typedef typename DeviceType::execution_space    execution_space;
+  typedef typename DeviceType::memory_space       memory_space;
+  typedef typename execution_space::size_type     size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -157,93 +146,89 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i7 = 0 ; i7 < unsigned(left.dimension_7()) ; ++i7 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i7 = 0; i7 < unsigned( left.dimension_7() ); ++i7 )
+    for ( unsigned i6 = 0; i6 < unsigned( left.dimension_6() ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) -
                      & left(  0,  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
-      if ( & left(i0,i1,i2,i3,i4,i5,i6,i7) !=
-           & left_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
-        update |= 4 ;
+      if ( & left( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & left_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 4;
       }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
-    for ( unsigned i7 = 0 ; i7 < unsigned(right.dimension_7()) ; ++i7 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.dimension_6() ); ++i6 )
+    for ( unsigned i7 = 0; i7 < unsigned( right.dimension_7() ); ++i7 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) -
                      & right(  0,  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
-      if ( & right(i0,i1,i2,i3,i4,i5,i6,i7) !=
-           & right_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
-        update |= 8 ;
+      if ( & right( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & right_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 8;
       }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 7 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -254,81 +239,77 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i6 = 0; i6 < unsigned( left.dimension_6() ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
                      & left(  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.dimension_6() ); ++i6 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
                      & right(  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 6 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -339,84 +320,78 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5 ) -
                      & left(  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5 ) -
                      & right(  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 5 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -429,83 +404,79 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
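+    // LayoutLeft traversal ( i0 innermost ): each element's offset from element
+    // zero must strictly increase and stay within the allocation ( bit 1 ) and
+    // must match the LayoutStride mirror ( bit 4 ); the LayoutRight loop below
+    // reports the analogous failures through bits 2 and 8.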
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4 ) -
                      & left(  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
       if ( & left( i0, i1, i2, i3, i4 ) !=
-           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4; }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
     {
       const long j = & right( i0, i1, i2, i3, i4 ) -
                      & right(  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
       if ( & right( i0, i1, i2, i3, i4 ) !=
-           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 4 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -516,84 +487,78 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3 ) -
                      & left(  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
     {
       const long j = & right( i0, i1, i2, i3 ) -
                      & right(  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 3 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
-    : left(  std::string("left") )
-    , right( std::string("right") )
+    : left(  std::string( "left" ) )
+    , right( std::string( "right" ) )
     , left_stride( left )
     , right_stride( right )
     , left_alloc( allocation_count( left ) )
@@ -602,85 +567,81 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2 ) -
                      & left(  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
-      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+      if ( & left( i0, i1, i2 ) != & left_stride( i0, i1, i2 ) ) { update |= 4; }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
     {
       const long j = & right( i0, i1, i2 ) -
                      & right(  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
-      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+      if ( & right( i0, i1, i2 ) != & right_stride( i0, i1, i2 ) ) { update |= 8; }
     }
 
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
     {
-      if ( & left(i0,i1,i2)  != & left(i0,i1,i2,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0,i1,i2) != & right(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left( i0, i1, i2 )  != & left( i0, i1, i2, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1, i2 ) != & right( i0, i1, i2, 0, 0, 0, 0, 0 ) ) { update |= 3; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 2 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -691,83 +652,77 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1 ) -
                      & left(  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
     {
       const long j = & right( i0, i1 ) -
                      & right(  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
 
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
     {
-      if ( & left(i0,i1)  != & left(i0,i1,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0,i1) != & right(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left( i0, i1 )  != & left( i0, i1, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1 ) != & right( i0, i1, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -780,78 +735,75 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
-      if ( & left(i0)  != & left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0) != & right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
-      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
+      if ( & left( i0 )  != & left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0 ) != & right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( & left( i0 )  != & left_stride( i0 ) ) { update |= 4; }
+      if ( & right( i0 ) != & right_stride( i0 ) ) { update |= 8; }
     }
   }
 };
 
-template<class Layout, class DeviceType>
-struct TestViewMirror {
-
-  template<class MemoryTraits>
+template< class Layout, class DeviceType >
+struct TestViewMirror
+{
+  template< class MemoryTraits >
   void static test_mirror() {
-    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
-    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
-    auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
-    auto a_d = Kokkos::create_mirror(DeviceType(),a_h);
-
-    int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
-    int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
-    int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
-
-    ASSERT_EQ(equal_ptr_h_h2,0);
-    ASSERT_EQ(equal_ptr_h_d ,0);
-    ASSERT_EQ(equal_ptr_h2_d,0);
-    
-
-    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
-    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
-  }
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror( DeviceType(), a_h );
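+    // create_mirror always allocates, so neither mirror may alias a_h ( or each other ).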
 
+    int equal_ptr_h_h2 = ( a_h.data()  == a_h2.data() ) ? 1 : 0;
+    int equal_ptr_h_d  = ( a_h.data()  ==  a_d.data() ) ? 1 : 0;
+    int equal_ptr_h2_d = ( a_h2.data() ==  a_d.data() ) ? 1 : 0;
 
-  template<class MemoryTraits>
-  void static test_mirror_view() {
-    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
-    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
-    auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
-    auto a_d = Kokkos::create_mirror_view(DeviceType(),a_h);
-
-    int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
-    int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
-    int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
-
-    int is_same_memspace = std::is_same<Kokkos::HostSpace,typename DeviceType::memory_space>::value?1:0; 
-    ASSERT_EQ(equal_ptr_h_h2,1);
-    ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
-    ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+    ASSERT_EQ( equal_ptr_h_h2, 0 );
+    ASSERT_EQ( equal_ptr_h_d, 0 );
+    ASSERT_EQ( equal_ptr_h2_d, 0 );
 
+    ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
+    ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
+  }
 
-    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
-    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
-  } 
+  template< class MemoryTraits >
+  void static test_mirror_view() {
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror_view( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror_view( DeviceType(), a_h );
+
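+    // create_mirror_view reuses a_h whenever the memory spaces match, so the
+    // pointer comparisons below must track is_same_memspace.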
+    int equal_ptr_h_h2 = a_h.data()  == a_h2.data() ? 1 : 0;
+    int equal_ptr_h_d  = a_h.data()  ==  a_d.data() ? 1 : 0;
+    int equal_ptr_h2_d = a_h2.data() ==  a_d.data() ? 1 : 0;
+
+    int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
+    ASSERT_EQ( equal_ptr_h_h2, 1 );
+    ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
+    ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
+
+    ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
+    ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
+  }
 
   void static testit() {
-    test_mirror<Kokkos::MemoryTraits<0>>();
-    test_mirror<Kokkos::MemoryTraits<Kokkos::Unmanaged>>();
-    test_mirror_view<Kokkos::MemoryTraits<0>>();
-    test_mirror_view<Kokkos::MemoryTraits<Kokkos::Unmanaged>>();
+    test_mirror< Kokkos::MemoryTraits<0> >();
+    test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+    test_mirror_view< Kokkos::MemoryTraits<0> >();
+    test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
   }
 };
 
@@ -861,23 +813,21 @@ template< typename T, class DeviceType >
 class TestViewAPI
 {
 public:
-  typedef DeviceType        device ;
+  typedef DeviceType device;
 
-  enum { N0 = 1000 ,
-         N1 = 3 ,
-         N2 = 5 ,
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
          N3 = 7 };
 
-  typedef Kokkos::View< T , device > dView0 ;
-  typedef Kokkos::View< T* , device > dView1 ;
-  typedef Kokkos::View< T*[N1] , device > dView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
-
-  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
-
-  typedef typename dView0::host_mirror_space host ;
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
 
   TestViewAPI()
   {
@@ -889,41 +839,38 @@ public:
     run_test_subview_strided();
     run_test_vector();
 
-    TestViewOperator< T , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2] , device >::testit();
-    TestViewMirror<Kokkos::LayoutLeft, device >::testit(); 
-    TestViewMirror<Kokkos::LayoutRight, device >::testit(); 
-
+    TestViewOperator< T, device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2], device >::testit();
+    TestViewMirror< Kokkos::LayoutLeft, device >::testit();
+    TestViewMirror< Kokkos::LayoutRight, device >::testit();
   }
 
   static void run_test_mirror()
   {
-    typedef Kokkos::View< int , host > view_type ;
-    typedef typename view_type::HostMirror mirror_type ;
+    typedef Kokkos::View< int, host > view_type;
+    typedef typename view_type::HostMirror mirror_type;
 
-    static_assert( std::is_same< typename view_type::memory_space
-                               , typename mirror_type::memory_space
-                               >::value , "" );
+    static_assert( std::is_same< typename view_type::memory_space, typename mirror_type::memory_space >::value, "" );
 
-    view_type a("a");
-    mirror_type am = Kokkos::create_mirror_view(a);
-    mirror_type ax = Kokkos::create_mirror(a);
-    ASSERT_EQ( & a() , & am() );
+    view_type a( "a" );
+    mirror_type am = Kokkos::create_mirror_view( a );
+    mirror_type ax = Kokkos::create_mirror( a );
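+    // view_type and its HostMirror share a memory space here, so the mirror
+    // view must alias the original allocation.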
+    ASSERT_EQ( & a(), & am() );
   }
 
   static void run_test_scalar()
   {
-    typedef typename dView0::HostMirror  hView0 ;
+    typedef typename dView0::HostMirror  hView0;
 
-    dView0 dx , dy ;
-    hView0 hx , hy ;
+    dView0 dx, dy;
+    hView0 hx, hy;
 
     dx = dView0( "dx" );
     dy = dView0( "dy" );
@@ -931,11 +878,11 @@ public:
     hx = Kokkos::create_mirror( dx );
     hy = Kokkos::create_mirror( dy );
 
-    hx() = 1 ;
+    hx() = 1;
 
-    Kokkos::deep_copy( dx , hx );
-    Kokkos::deep_copy( dy , dx );
-    Kokkos::deep_copy( hy , dy );
+    Kokkos::deep_copy( dx, hx );
+    Kokkos::deep_copy( dy, dx );
+    Kokkos::deep_copy( hy, dy );
 
     ASSERT_EQ( hx(), hy() );
   }
@@ -948,11 +895,11 @@ public:
     // usual "(void)" marker to avoid compiler warnings for unused
     // variables.
 
-    typedef typename dView0::HostMirror  hView0 ;
-    typedef typename dView1::HostMirror  hView1 ;
-    typedef typename dView2::HostMirror  hView2 ;
-    typedef typename dView3::HostMirror  hView3 ;
-    typedef typename dView4::HostMirror  hView4 ;
+    typedef typename dView0::HostMirror  hView0;
+    typedef typename dView1::HostMirror  hView1;
+    typedef typename dView2::HostMirror  hView2;
+    typedef typename dView3::HostMirror  hView3;
+    typedef typename dView4::HostMirror  hView4;
 
     {
       hView0 thing;
@@ -975,8 +922,8 @@ public:
       (void) thing;
     }
 
-    dView4 dx , dy , dz ;
-    hView4 hx , hy , hz ;
+    dView4 dx, dy, dz;
+    hView4 hx, hy, hz;
 
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
@@ -984,220 +931,239 @@ public:
     ASSERT_TRUE( hx.ptr_on_device() == 0 );
     ASSERT_TRUE( hy.ptr_on_device() == 0 );
     ASSERT_TRUE( hz.ptr_on_device() == 0 );
-    ASSERT_EQ( dx.dimension_0() , 0u );
-    ASSERT_EQ( dy.dimension_0() , 0u );
-    ASSERT_EQ( dz.dimension_0() , 0u );
-    ASSERT_EQ( hx.dimension_0() , 0u );
-    ASSERT_EQ( hy.dimension_0() , 0u );
-    ASSERT_EQ( hz.dimension_0() , 0u );
-    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dz.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hz.dimension_1() , unsigned(N1) );
-
-    dx = dView4( "dx" , N0 );
-    dy = dView4( "dy" , N0 );
-
-    ASSERT_EQ( dx.use_count() , size_t(1) );
+    ASSERT_EQ( dx.dimension_0(), 0u );
+    ASSERT_EQ( dy.dimension_0(), 0u );
+    ASSERT_EQ( dz.dimension_0(), 0u );
+    ASSERT_EQ( hx.dimension_0(), 0u );
+    ASSERT_EQ( hy.dimension_0(), 0u );
+    ASSERT_EQ( hz.dimension_0(), 0u );
+    ASSERT_EQ( dx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dz.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hz.dimension_1(), unsigned( N1 ) );
+
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
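+    // Unmanaged views do not participate in reference counting, so the
+    // unmanaged copies below must leave use_count unchanged.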
 
     dView4_unmanaged unmanaged_dx = dx;
-    ASSERT_EQ( dx.use_count() , size_t(1) );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
 
-    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(dx.ptr_on_device(),
-                                                              dx.dimension_0(),
-                                                              dx.dimension_1(),
-                                                              dx.dimension_2(),
-                                                              dx.dimension_3());
+    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged( dx.ptr_on_device(),
+                                                               dx.dimension_0(),
+                                                               dx.dimension_1(),
+                                                               dx.dimension_2(),
+                                                               dx.dimension_3() );
 
     {
-      // Destruction of this view should be harmless
-      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
-                                                dx.dimension_0() ,
-                                                dx.dimension_1() ,
-                                                dx.dimension_2() ,
+      // Destruction of this view should be harmless.
+      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device(),
+                                                dx.dimension_0(),
+                                                dx.dimension_1(),
+                                                dx.dimension_2(),
                                                 dx.dimension_3() );
     }
 
-    const_dView4 const_dx = dx ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+    const_dView4 const_dx = dx;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
     {
       const_dView4 const_dx2;
       const_dx2 = const_dx;
-      ASSERT_EQ( dx.use_count() , size_t(3) );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
 
       const_dx2 = dy;
-      ASSERT_EQ( dx.use_count() , size_t(2) );
+      ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
-      const_dView4 const_dx3(dx);
-      ASSERT_EQ( dx.use_count() , size_t(3) );
-      
-      dView4_unmanaged dx4_unmanaged(dx);
-      ASSERT_EQ( dx.use_count() , size_t(3) );
-    }
+      const_dView4 const_dx3( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
 
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+      dView4_unmanaged dx4_unmanaged( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    }
 
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
     ASSERT_FALSE( dx.ptr_on_device() == 0 );
     ASSERT_FALSE( const_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( unmanaged_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( unmanaged_from_ptr_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( dy.ptr_on_device() == 0 );
-    ASSERT_NE( dx , dy );
+    ASSERT_NE( dx, dy );
 
-    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dx.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( dx.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( dx.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( dx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dx.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( dx.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( dy.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( dy.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( dy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dy.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( dy.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
 
     hx = Kokkos::create_mirror( dx );
     hy = Kokkos::create_mirror( dy );
 
-    // T v1 = hx() ;    // Generates compile error as intended
-    // T v2 = hx(0,0) ; // Generates compile error as intended
-    // hx(0,0) = v2 ;   // Generates compile error as intended
+    // T v1 = hx();       // Generates compile error as intended.
+    // T v2 = hx( 0, 0 ); // Generates compile error as intended.
+    // hx( 0, 0 ) = v2;   // Generates compile error as intended.
 
-    // Testing with asynchronous deep copy with respect to device
+    // Testing with asynchronous deep copy with respect to device.
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-
-      Kokkos::deep_copy(typename hView4::execution_space(), dx , hx );
-      Kokkos::deep_copy(typename hView4::execution_space(), dy , dx );
-      Kokkos::deep_copy(typename hView4::execution_space(), hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy(typename hView4::execution_space(), dx , T(0) );
-      Kokkos::deep_copy(typename hView4::execution_space(), hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
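+      // Fill hx with a unique value per entry, round-trip it hx -> dx -> dy -> hy
+      // and verify hy matches, then zero dx and confirm the zeros copy back to hx.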
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename hView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename hView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename hView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
 
-    // Testing with asynchronous deep copy with respect to host
+    // Testing with asynchronous deep copy with respect to host.
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-      Kokkos::deep_copy(typename dView4::execution_space(), dx , hx );
-      Kokkos::deep_copy(typename dView4::execution_space(), dy , dx );
-      Kokkos::deep_copy(typename dView4::execution_space(), hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy(typename dView4::execution_space(), dx , T(0) );
-      Kokkos::deep_copy(typename dView4::execution_space(), hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename dView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename dView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename dView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
 
-    // Testing with synchronous deep copy
+    // Testing with synchronous deep copy.
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-      Kokkos::deep_copy( dx , hx );
-      Kokkos::deep_copy( dy , dx );
-      Kokkos::deep_copy( hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy( dx , T(0) );
-      Kokkos::deep_copy( hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( dx, hx );
+      Kokkos::deep_copy( dy, dx );
+      Kokkos::deep_copy( hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( dx, T( 0 ) );
+      Kokkos::deep_copy( hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
-    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
-    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+
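+    // Assignment makes dz share dx's allocation, so dz now compares equal to dx
+    // and unequal to dy ( and vice versa after the second assignment ).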
+    dz = dx;
+    ASSERT_EQ( dx, dz );
+    ASSERT_NE( dy, dz );
+
+    dz = dy;
+    ASSERT_EQ( dy, dz );
+    ASSERT_NE( dx, dz );
 
     dx = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_FALSE( dy.ptr_on_device() == 0 );
     ASSERT_FALSE( dz.ptr_on_device() == 0 );
+
     dy = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
     ASSERT_FALSE( dz.ptr_on_device() == 0 );
+
     dz = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
     ASSERT_TRUE( dz.ptr_on_device() == 0 );
   }
 
-  typedef T DataType[2] ;
+  typedef T DataType[2];
 
   static void
   check_auto_conversion_to_const(
-     const Kokkos::View< const DataType , device > & arg_const ,
-     const Kokkos::View< DataType , device > & arg )
+     const Kokkos::View< const DataType, device > & arg_const,
+     const Kokkos::View< DataType, device > & arg )
   {
     ASSERT_TRUE( arg_const == arg );
   }
 
   static void run_test_const()
   {
-    typedef Kokkos::View< DataType , device > typeX ;
-    typedef Kokkos::View< const DataType , device > const_typeX ;
-    typedef Kokkos::View< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typedef Kokkos::View< DataType, device > typeX;
+    typedef Kokkos::View< const DataType, device > const_typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryRandomAccess > const_typeR;
+
     typeX x( "X" );
-    const_typeX xc = x ;
-    const_typeR xr = x ;
+    const_typeX xc = x;
+    const_typeR xr = x;
 
     ASSERT_TRUE( xc == x );
     ASSERT_TRUE( x == xc );
@@ -1206,144 +1172,142 @@ public:
     // an lvalue reference due to retrieving through texture cache
     // therefore not allowed to query the underlying pointer.
 #if defined( KOKKOS_ENABLE_CUDA )
-    if ( ! std::is_same< typename device::execution_space , Kokkos::Cuda >::value )
+    if ( !std::is_same< typename device::execution_space, Kokkos::Cuda >::value )
 #endif
     {
       ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
     }
 
-    // typeX xf = xc ; // setting non-const from const must not compile
+    // typeX xf = xc; // Setting non-const from const must not compile.
 
-    check_auto_conversion_to_const( x , x );
+    check_auto_conversion_to_const( x, x );
   }
 
   static void run_test_subview()
   {
-    typedef Kokkos::View< const T , device > sView ;
+    typedef Kokkos::View< const T, device > sView;
 
     dView0 d0( "d0" );
-    dView1 d1( "d1" , N0 );
-    dView2 d2( "d2" , N0 );
-    dView3 d3( "d3" , N0 );
-    dView4 d4( "d4" , N0 );
-
-    sView s0 = d0 ;
-    sView s1 = Kokkos::subview( d1 , 1 );
-    sView s2 = Kokkos::subview( d2 , 1 , 1 );
-    sView s3 = Kokkos::subview( d3 , 1 , 1 , 1 );
-    sView s4 = Kokkos::subview( d4 , 1 , 1 , 1 , 1 );
+    dView1 d1( "d1", N0 );
+    dView2 d2( "d2", N0 );
+    dView3 d3( "d3", N0 );
+    dView4 d4( "d4", N0 );
+
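+    // Every index is pinned, so each subview collapses to a rank-0 const view.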
+    sView s0 = d0;
+    sView s1 = Kokkos::subview( d1, 1 );
+    sView s2 = Kokkos::subview( d2, 1, 1 );
+    sView s3 = Kokkos::subview( d3, 1, 1, 1 );
+    sView s4 = Kokkos::subview( d4, 1, 1, 1, 1 );
   }
 
   static void run_test_subview_strided()
   {
-    typedef Kokkos::View< int **** , Kokkos::LayoutLeft  , host >  view_left_4 ;
-    typedef Kokkos::View< int **** , Kokkos::LayoutRight , host >  view_right_4 ;
-    typedef Kokkos::View< int **   , Kokkos::LayoutLeft  , host >  view_left_2 ;
-    typedef Kokkos::View< int **   , Kokkos::LayoutRight , host >  view_right_2 ;
-
-    typedef Kokkos::View< int * ,  Kokkos::LayoutStride , host >  view_stride_1 ;
-    typedef Kokkos::View< int ** ,  Kokkos::LayoutStride , host >  view_stride_2 ;
-
-    view_left_2  xl2("xl2", 100 , 200 );
-    view_right_2 xr2("xr2", 100 , 200 );
-    view_stride_1  yl1 = Kokkos::subview( xl2 , 0 , Kokkos::ALL() );
-    view_stride_1  yl2 = Kokkos::subview( xl2 , 1 , Kokkos::ALL() );
-    view_stride_1  yr1 = Kokkos::subview( xr2 , 0 , Kokkos::ALL() );
-    view_stride_1  yr2 = Kokkos::subview( xr2 , 1 , Kokkos::ALL() );
-
-    ASSERT_EQ( yl1.dimension_0() , xl2.dimension_1() );
-    ASSERT_EQ( yl2.dimension_0() , xl2.dimension_1() );
-    ASSERT_EQ( yr1.dimension_0() , xr2.dimension_1() );
-    ASSERT_EQ( yr2.dimension_0() , xr2.dimension_1() );
-
-    ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 );
-    ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 );
-    ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 );
-    ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 );
-
-    view_left_4 xl4( "xl4" , 10 , 20 , 30 , 40 );
-    view_right_4 xr4( "xr4" , 10 , 20 , 30 , 40 );
-
-    view_stride_2 yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-    view_stride_2 yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-
-    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
-    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
-    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
-    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
-
-    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
-    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
+    typedef Kokkos::View< int ****, Kokkos::LayoutLeft , host >  view_left_4;
+    typedef Kokkos::View< int ****, Kokkos::LayoutRight, host >  view_right_4;
+    typedef Kokkos::View< int **  , Kokkos::LayoutLeft , host >  view_left_2;
+    typedef Kokkos::View< int **  , Kokkos::LayoutRight, host >  view_right_2;
+
+    typedef Kokkos::View< int * ,  Kokkos::LayoutStride, host >  view_stride_1;
+    typedef Kokkos::View< int **,  Kokkos::LayoutStride, host >  view_stride_2;
+
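+    // A rank-1 slice of a rank-2 view is taken as LayoutStride here; its extent
+    // and base address must match the corresponding row or column of the parent.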
+    view_left_2  xl2( "xl2", 100, 200 );
+    view_right_2 xr2( "xr2", 100, 200 );
+    view_stride_1 yl1 = Kokkos::subview( xl2, 0, Kokkos::ALL() );
+    view_stride_1 yl2 = Kokkos::subview( xl2, 1, Kokkos::ALL() );
+    view_stride_1 yr1 = Kokkos::subview( xr2, 0, Kokkos::ALL() );
+    view_stride_1 yr2 = Kokkos::subview( xr2, 1, Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.dimension_0(), xl2.dimension_1() );
+    ASSERT_EQ( yl2.dimension_0(), xl2.dimension_1() );
+    ASSERT_EQ( yr1.dimension_0(), xr2.dimension_1() );
+    ASSERT_EQ( yr2.dimension_0(), xr2.dimension_1() );
+
+    ASSERT_EQ( & yl1( 0 ) - & xl2( 0, 0 ), 0 );
+    ASSERT_EQ( & yl2( 0 ) - & xl2( 1, 0 ), 0 );
+    ASSERT_EQ( & yr1( 0 ) - & xr2( 0, 0 ), 0 );
+    ASSERT_EQ( & yr2( 0 ) - & xr2( 1, 0 ), 0 );
+
+    view_left_4 xl4( "xl4", 10, 20, 30, 40 );
+    view_right_4 xr4( "xr4", 10, 20, 30, 40 );
+
+    view_stride_2 yl4 = Kokkos::subview( xl4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );
+    view_stride_2 yr4 = Kokkos::subview( xr4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.dimension_0(), xl4.dimension_1() );
+    ASSERT_EQ( yl4.dimension_1(), xl4.dimension_3() );
+    ASSERT_EQ( yr4.dimension_0(), xr4.dimension_1() );
+    ASSERT_EQ( yr4.dimension_1(), xr4.dimension_3() );
+
+    ASSERT_EQ( & yl4( 4, 4 ) - & xl4( 1, 4, 2, 4 ), 0 );
+    ASSERT_EQ( & yr4( 4, 4 ) - & xr4( 1, 4, 2, 4 ), 0 );
   }
 
   static void run_test_vector()
   {
-    static const unsigned Length = 1000 , Count = 8 ;
+    static const unsigned Length = 1000, Count = 8;
 
-    typedef Kokkos::View< T* ,  Kokkos::LayoutLeft , host > vector_type ;
-    typedef Kokkos::View< T** , Kokkos::LayoutLeft , host > multivector_type ;
+    typedef Kokkos::View< T*,  Kokkos::LayoutLeft, host > vector_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutLeft, host > multivector_type;
 
-    typedef Kokkos::View< T* ,  Kokkos::LayoutRight , host > vector_right_type ;
-    typedef Kokkos::View< T** , Kokkos::LayoutRight , host > multivector_right_type ;
+    typedef Kokkos::View< T*,  Kokkos::LayoutRight, host > vector_right_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutRight, host > multivector_right_type;
 
-    typedef Kokkos::View< const T* , Kokkos::LayoutRight, host > const_vector_right_type ;
-    typedef Kokkos::View< const T* , Kokkos::LayoutLeft , host > const_vector_type ;
-    typedef Kokkos::View< const T** , Kokkos::LayoutLeft , host > const_multivector_type ;
+    typedef Kokkos::View< const T*,  Kokkos::LayoutRight, host > const_vector_right_type;
+    typedef Kokkos::View< const T*,  Kokkos::LayoutLeft,  host > const_vector_type;
+    typedef Kokkos::View< const T**, Kokkos::LayoutLeft,  host > const_multivector_type;
 
-    multivector_type mv = multivector_type( "mv" , Length , Count );
-    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
+    multivector_type mv = multivector_type( "mv", Length, Count );
+    multivector_right_type mv_right = multivector_right_type( "mv", Length, Count );
 
-    vector_type v1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    vector_type v2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    vector_type v3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+    vector_type v1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_type v2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_type v3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    vector_type rv1 = Kokkos::subview( mv_right , 0 , Kokkos::ALL() );
-    vector_type rv2 = Kokkos::subview( mv_right , 1 , Kokkos::ALL() );
-    vector_type rv3 = Kokkos::subview( mv_right , 2 , Kokkos::ALL() );
+    vector_type rv1 = Kokkos::subview( mv_right, 0, Kokkos::ALL() );
+    vector_type rv2 = Kokkos::subview( mv_right, 1, Kokkos::ALL() );
+    vector_type rv3 = Kokkos::subview( mv_right, 2, Kokkos::ALL() );
 
-    multivector_type mv1 = Kokkos::subview( mv , std::make_pair( 1 , 998 ) ,
-                                                 std::make_pair( 2 , 5 ) );
+    multivector_type mv1 = Kokkos::subview( mv, std::make_pair( 1, 998 ),
+                                                std::make_pair( 2, 5 ) );
 
-    multivector_right_type mvr1 =
-      Kokkos::subview( mv_right ,
-                       std::make_pair( 1 , 998 ) ,
-                       std::make_pair( 2 , 5 ) );
+    multivector_right_type mvr1 = Kokkos::subview( mv_right, std::make_pair( 1, 998 ),
+                                                             std::make_pair( 2, 5 ) );
 
-    const_vector_type cv1 = Kokkos::subview( mv , Kokkos::ALL(), 0 );
-    const_vector_type cv2 = Kokkos::subview( mv , Kokkos::ALL(), 1 );
-    const_vector_type cv3 = Kokkos::subview( mv , Kokkos::ALL(), 2 );
+    const_vector_type cv1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_type cv2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_type cv3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    vector_right_type vr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    vector_right_type vr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    vector_right_type vr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
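+    // A single column of the LayoutLeft multivector is contiguous, so it can
+    // also be aliased through LayoutRight rank-1 views.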
+    vector_right_type vr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_right_type vr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_right_type vr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    const_vector_right_type cvr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    const_vector_right_type cvr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    const_vector_right_type cvr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+    const_vector_right_type cvr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_right_type cvr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_right_type cvr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    ASSERT_TRUE( & v1[0] == & v1(0) );
-    ASSERT_TRUE( & v1[0] == & mv(0,0) );
-    ASSERT_TRUE( & v2[0] == & mv(0,1) );
-    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+    ASSERT_TRUE( & v1[0] == & v1( 0 ) );
+    ASSERT_TRUE( & v1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & v2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & v3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
-    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
-    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+    ASSERT_TRUE( & cv1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cv2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cv3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
-    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
-    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+    ASSERT_TRUE( & vr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & vr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & vr3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
-    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
-    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+    ASSERT_TRUE( & cvr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cvr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cvr3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
-    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
-    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
-    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
-    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
-    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+    ASSERT_TRUE( & mv1( 0, 0 ) == & mv( 1, 2 ) );
+    ASSERT_TRUE( & mv1( 1, 1 ) == & mv( 2, 3 ) );
+    ASSERT_TRUE( & mv1( 3, 2 ) == & mv( 4, 4 ) );
+    ASSERT_TRUE( & mvr1( 0, 0 ) == & mv_right( 1, 2 ) );
+    ASSERT_TRUE( & mvr1( 1, 1 ) == & mv_right( 2, 3 ) );
+    ASSERT_TRUE( & mvr1( 3, 2 ) == & mv_right( 4, 4 ) );
 
     const_vector_type c_cv1( v1 );
     typename vector_type::const_type c_cv2( v2 );
@@ -1356,6 +1320,3 @@ public:
 };
 
 } // namespace Test
-
-/*--------------------------------------------------------------------------*/
-
diff --git a/lib/kokkos/core/unit_test/TestViewMapping.hpp b/lib/kokkos/core/unit_test/TestViewMapping.hpp
index 324f02e94730d99365804684776e48ac64c3a351..71604bed51d93e374c8de9776bb24d2135c95182 100644
--- a/lib/kokkos/core/unit_test/TestViewMapping.hpp
+++ b/lib/kokkos/core/unit_test/TestViewMapping.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -49,1126 +49,1140 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 template< class Space >
 void test_view_mapping()
 {
-  typedef typename Space::execution_space ExecSpace ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2> dim_s2 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2,3> dim_s2_s3 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2,3,4> dim_s2_s3_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0> dim_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,3> dim_s0_s3 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,3,4> dim_s0_s3_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0> dim_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,4> dim_s0_s0_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0> dim_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0> dim_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0> dim_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0_s0 ;
-
-  // Fully static dimensions should not be larger than an int
-  ASSERT_LE( sizeof(dim_0) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2_s3) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2_s3_s4) , sizeof(int) );
-
-  // Rank 1 is size_t
-  ASSERT_EQ( sizeof(dim_s0) , sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s3) , sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s3_s4) , sizeof(size_t) );
-
-  // Allow for padding
-  ASSERT_LE( sizeof(dim_s0_s0) , 2 * sizeof(size_t) );
-  ASSERT_LE( sizeof(dim_s0_s0_s4) , 2 * sizeof(size_t) );
-
-  ASSERT_LE( sizeof(dim_s0_s0_s0) , 4 * sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0) , 4 * sizeof(unsigned) );
-  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
-  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
-
-  static_assert( int(dim_0::rank) == int(0) , "" );
-  static_assert( int(dim_0::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_0::ArgN0) == 1 , "" );
-  static_assert( int(dim_0::ArgN1) == 1 , "" );
-  static_assert( int(dim_0::ArgN2) == 1 , "" );
-
-  static_assert( int(dim_s2::rank) == int(1) , "" );
-  static_assert( int(dim_s2::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2::ArgN1) == 1 , "" );
-
-  static_assert( int(dim_s2_s3::rank) == int(2) , "" );
-  static_assert( int(dim_s2_s3::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2_s3::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2_s3::ArgN1) == 3 , "" );
-  static_assert( int(dim_s2_s3::ArgN2) == 1 , "" );
-
-  static_assert( int(dim_s2_s3_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s2_s3_s4::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN1) == 3 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN2) == 4 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN3) == 1 , "" );
-
-  static_assert( int(dim_s0::rank) == int(1) , "" );
-  static_assert( int(dim_s0::rank_dynamic) == int(1) , "" );
-
-  static_assert( int(dim_s0_s3::rank) == int(2) , "" );
-  static_assert( int(dim_s0_s3::rank_dynamic) == int(1) , "" );
-  static_assert( int(dim_s0_s3::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s3::ArgN1) == 3 , "" );
-
-  static_assert( int(dim_s0_s3_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s3_s4::rank_dynamic) == int(1) , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN1) == 3 , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN2) == 4 , "" );
-
-  static_assert( int(dim_s0_s0_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s0_s4::rank_dynamic) == int(2) , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN1) == 0 , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN2) == 4 , "" );
-
-  static_assert( int(dim_s0_s0_s0::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s0_s0::rank_dynamic) == int(3) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0::rank) == int(4) , "" );
-  static_assert( int(dim_s0_s0_s0_s0::rank_dynamic) == int(4) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0::rank) == int(5) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8) , "" );
-
-  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 ); 
+  typedef typename Space::execution_space ExecSpace;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2 > dim_s2;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3 > dim_s2_s3;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0 > dim_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3 > dim_s0_s3;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0 > dim_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0;
+
+  // Fully static dimensions should not be larger than an int.
+  ASSERT_LE( sizeof( dim_0 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3_s4 ), sizeof( int ) );
+
+  // Rank 1 is size_t.
+  ASSERT_EQ( sizeof( dim_s0 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3_s4 ), sizeof( size_t ) );
+
+  // Allow for padding.
+  ASSERT_LE( sizeof( dim_s0_s0 ), 2 * sizeof( size_t ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s4 ), 2 * sizeof( size_t ) );
+
+  ASSERT_LE( sizeof( dim_s0_s0_s0 ), 4 * sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0 ), 4 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+
+  static_assert( int( dim_0::rank ) == int( 0 ), "" );
+  static_assert( int( dim_0::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_0::ArgN0 ) == 1, "" );
+  static_assert( int( dim_0::ArgN1 ) == 1, "" );
+  static_assert( int( dim_0::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s2::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2::ArgN1 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s2_s3::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s2_s3_s4::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3_s4::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN2 ) == 4, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN3 ) == 1, "" );
+
+  static_assert( int( dim_s0::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s0::rank_dynamic ) == int( 1 ), "" );
+
+  static_assert( int( dim_s0_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s3::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3::ArgN1 ) == 3, "" );
+
+  static_assert( int( dim_s0_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s3_s4::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s4::rank_dynamic ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s0_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN1 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s0::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s0::rank_dynamic ) == int( 3 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0::rank ) == int( 4 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0::rank_dynamic ) == int( 4 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank ) == int( 5 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank_dynamic ) == int( 5 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank ) == int( 6 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 6 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 7 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 7 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 8 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 8 ), "" );
+
+  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0       d2( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0_s0    d3( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 );
 
-  ASSERT_EQ( d1.N0 , 2 );
-  ASSERT_EQ( d2.N0 , 2 );
-  ASSERT_EQ( d3.N0 , 2 );
-  ASSERT_EQ( d4.N0 , 2 );
+  ASSERT_EQ( d1.N0, 2 );
+  ASSERT_EQ( d2.N0, 2 );
+  ASSERT_EQ( d3.N0, 2 );
+  ASSERT_EQ( d4.N0, 2 );
 
-  ASSERT_EQ( d1.N1 , 1 );
-  ASSERT_EQ( d2.N1 , 3 );
-  ASSERT_EQ( d3.N1 , 3 );
-  ASSERT_EQ( d4.N1 , 3 );
+  ASSERT_EQ( d1.N1, 1 );
+  ASSERT_EQ( d2.N1, 3 );
+  ASSERT_EQ( d3.N1, 3 );
+  ASSERT_EQ( d4.N1, 3 );
 
-  ASSERT_EQ( d1.N2 , 1 );
-  ASSERT_EQ( d2.N2 , 1 );
-  ASSERT_EQ( d3.N2 , 4 );
-  ASSERT_EQ( d4.N2 , 4 );
+  ASSERT_EQ( d1.N2, 1 );
+  ASSERT_EQ( d2.N2, 1 );
+  ASSERT_EQ( d3.N2, 4 );
+  ASSERT_EQ( d4.N2, 4 );
 
-  ASSERT_EQ( d1.N3 , 1 );
-  ASSERT_EQ( d2.N3 , 1 );
-  ASSERT_EQ( d3.N3 , 1 );
-  ASSERT_EQ( d4.N3 , 5 );
+  ASSERT_EQ( d1.N3, 1 );
+  ASSERT_EQ( d2.N3, 1 );
+  ASSERT_EQ( d3.N3, 1 );
+  ASSERT_EQ( d4.N3, 5 );
 
   //----------------------------------------
 
-  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0 , Kokkos::LayoutStride >  stride_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0;
 
   //----------------------------------------
-  // Static dimension
+  // Static dimension.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutLeft > left_s2_s3_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4;
 
-    ASSERT_EQ( sizeof(left_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+    ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
 
-    left_s2_s3_s4 off3 ;
+    left_s2_s3_s4 off3;
 
-    stride_s0_s0_s0  stride3( off3 );
+    stride_s0_s0_s0 stride3( off3 );
 
-    ASSERT_EQ( off3.stride_0() , 1 );
-    ASSERT_EQ( off3.stride_1() , 2 );
-    ASSERT_EQ( off3.stride_2() , 6 );
-    ASSERT_EQ( off3.span() , 24 );
+    ASSERT_EQ( off3.stride_0(), 1 );
+    ASSERT_EQ( off3.stride_1(), 2 );
+    ASSERT_EQ( off3.stride_2(), 6 );
+    ASSERT_EQ( off3.span(), 24 );
 
-    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( off3.span() , stride3.span() );
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
 
-    int offset = 0 ;
+    int offset = 0;
 
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
-      ASSERT_EQ( off3(i,j,k) , offset );
-      ASSERT_EQ( stride3(i,j,k) , off3(i,j,k) );
-    }}}
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( stride3( i, j, k ), off3( i, j, k ) );
+    }
   }
 
   //----------------------------------------
-  // Small dimension is unpadded
+  // Small dimension is unpadded.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
 
     const Kokkos::LayoutLeft layout = dyn_off3.layout();
 
-    ASSERT_EQ( layout.dimension[0] , 2 );
-    ASSERT_EQ( layout.dimension[1] , 3 );
-    ASSERT_EQ( layout.dimension[2] , 4 );
-    ASSERT_EQ( layout.dimension[3] , 1 );
-    ASSERT_EQ( layout.dimension[4] , 1 );
-    ASSERT_EQ( layout.dimension[5] , 1 );
-    ASSERT_EQ( layout.dimension[6] , 1 );
-    ASSERT_EQ( layout.dimension[7] , 1 );
-
-    ASSERT_EQ( stride3.m_dim.rank , 3 );
-    ASSERT_EQ( stride3.m_dim.N0 , 2 );
-    ASSERT_EQ( stride3.m_dim.N1 , 3 );
-    ASSERT_EQ( stride3.m_dim.N2 , 4 );
-    ASSERT_EQ( stride3.m_dim.N3 , 1 );
-    ASSERT_EQ( stride3.size() , 2 * 3 * 4 );
-
-    int offset = 0 ;
-
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
-      ASSERT_EQ( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
-
-    ASSERT_EQ( dyn_off3.span() , offset );
-    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+    ASSERT_EQ( layout.dimension[0], 2 );
+    ASSERT_EQ( layout.dimension[1], 3 );
+    ASSERT_EQ( layout.dimension[2], 4 );
+    ASSERT_EQ( layout.dimension[3], 1 );
+    ASSERT_EQ( layout.dimension[4], 1 );
+    ASSERT_EQ( layout.dimension[5], 1 );
+    ASSERT_EQ( layout.dimension[6], 1 );
+    ASSERT_EQ( layout.dimension[7], 1 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, 2 );
+    ASSERT_EQ( stride3.m_dim.N1, 3 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), 2 * 3 * 4 );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
   }
 
-  // Large dimension is likely padded
+  //----------------------------------------
+  // Large dimension is likely padded.
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
-
-    ASSERT_EQ( stride3.m_dim.rank , 3 );
-    ASSERT_EQ( stride3.m_dim.N0 , N0 );
-    ASSERT_EQ( stride3.m_dim.N1 , N1 );
-    ASSERT_EQ( stride3.m_dim.N2 , 4 );
-    ASSERT_EQ( stride3.m_dim.N3 , 1 );
-    ASSERT_EQ( stride3.size() , N0 * N1 * 4 );
-    ASSERT_EQ( stride3.span() , dyn_off3.span() );
-
-    int offset = 0 ;
-
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < N1 ; ++j ){
-    for ( int i = 0 ; i < N0 ; ++i ){
-      ASSERT_LE( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-      offset = dyn_off3(i,j,k) + 1 ;
-    }}}
-
-    ASSERT_LE( offset , dyn_off3.span() );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, N0 );
+    ASSERT_EQ( stride3.m_dim.N1, N1 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), N0 * N1 * 4 );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < N1; ++j )
+    for ( int i = 0; i < N0; ++i )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
   }
 
   //----------------------------------------
-  // Static dimension
+  // Static dimension.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutRight > right_s2_s3_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4;
 
-    ASSERT_EQ( sizeof(right_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+    ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
 
-    right_s2_s3_s4 off3 ;
+    right_s2_s3_s4 off3;
 
     stride_s0_s0_s0  stride3( off3 );
 
-    ASSERT_EQ( off3.stride_0() , 12 );
-    ASSERT_EQ( off3.stride_1() , 4 );
-    ASSERT_EQ( off3.stride_2() , 1 );
+    ASSERT_EQ( off3.stride_0(), 12 );
+    ASSERT_EQ( off3.stride_1(), 4 );
+    ASSERT_EQ( off3.stride_2(), 1 );
 
-    ASSERT_EQ( off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( off3.span() , stride3.span() );
+    ASSERT_EQ( off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
 
-    int offset = 0 ;
+    int offset = 0;
 
-    for ( int i = 0 ; i < 2 ; ++i ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
-      ASSERT_EQ( off3(i,j,k) , offset );
-      ASSERT_EQ( off3(i,j,k) , stride3(i,j,k) );
-    }}}
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( off3( i, j, k ), stride3( i, j, k ) );
+    }
 
-    ASSERT_EQ( off3.span() , offset );
+    ASSERT_EQ( off3.span(), offset );
   }
 
   //----------------------------------------
-  // Small dimension is unpadded
+  // Small dimension is unpadded.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
-
-    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( dyn_off3.span() , stride3.span() );
-
-    int offset = 0 ;
-
-    for ( int i = 0 ; i < 2 ; ++i ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
-      ASSERT_EQ( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
-    }}}
-
-    ASSERT_EQ( dyn_off3.span() , offset );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
   }
 
-  // Large dimension is likely padded
+  //----------------------------------------
+  // Large dimension is likely padded.
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
-
-    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( dyn_off3.span() , stride3.span() );
-
-    int offset = 0 ;
-
-    for ( int i = 0 ; i < N0 ; ++i ){
-    for ( int j = 0 ; j < N1 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k ){
-      ASSERT_LE( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
-      offset = dyn_off3(i,j,k) + 1 ;
-    }}}
-
-    ASSERT_LE( offset , dyn_off3.span() );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < N0; ++i )
+    for ( int j = 0; j < N1; ++j )
+    for ( int k = 0; k < 4; ++k )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
   }
 
   //----------------------------------------
-  // Subview
+  // Subview.
   {
     // Mapping rank 4 to rank 3
-    typedef Kokkos::Experimental::Impl::SubviewExtents<4,3> SubviewExtents ;
+    typedef Kokkos::Experimental::Impl::SubviewExtents< 4, 3 > SubviewExtents;
 
-    constexpr int N0 = 1000 ;
-    constexpr int N1 = 2000 ;
-    constexpr int N2 = 3000 ;
-    constexpr int N3 = 4000 ;
+    constexpr int N0 = 1000;
+    constexpr int N1 = 2000;
+    constexpr int N2 = 3000;
+    constexpr int N3 = 4000;
 
-    Kokkos::Experimental::Impl::ViewDimension<N0,N1,N2,N3> dim ;
+    Kokkos::Experimental::Impl::ViewDimension< N0, N1, N2, N3 > dim;
 
     SubviewExtents tmp( dim
                       , N0 / 2
                       , Kokkos::Experimental::ALL
-                      , std::pair<int,int>( N2 / 4 , 10 + N2 / 4 )
-                      , Kokkos::pair<int,int>( N3 / 4 , 20 + N3 / 4 )
+                      , std::pair< int, int >( N2 / 4, 10 + N2 / 4 )
+                      , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 )
                       );
 
-    ASSERT_EQ( tmp.domain_offset(0) , N0 / 2 );
-    ASSERT_EQ( tmp.domain_offset(1) , 0 );
-    ASSERT_EQ( tmp.domain_offset(2) , N2 / 4 );
-    ASSERT_EQ( tmp.domain_offset(3) , N3 / 4 );
+    ASSERT_EQ( tmp.domain_offset( 0 ), N0 / 2 );
+    ASSERT_EQ( tmp.domain_offset( 1 ), 0 );
+    ASSERT_EQ( tmp.domain_offset( 2 ), N2 / 4 );
+    ASSERT_EQ( tmp.domain_offset( 3 ), N3 / 4 );
 
-    ASSERT_EQ( tmp.range_index(0) , 1 );
-    ASSERT_EQ( tmp.range_index(1) , 2 );
-    ASSERT_EQ( tmp.range_index(2) , 3 );
+    ASSERT_EQ( tmp.range_index( 0 ), 1 );
+    ASSERT_EQ( tmp.range_index( 1 ), 2 );
+    ASSERT_EQ( tmp.range_index( 2 ), 3 );
 
-    ASSERT_EQ( tmp.range_extent(0) , N1 );
-    ASSERT_EQ( tmp.range_extent(1) , 10 );
-    ASSERT_EQ( tmp.range_extent(2) , 20 );
+    ASSERT_EQ( tmp.range_extent( 0 ), N1 );
+    ASSERT_EQ( tmp.range_extent( 1 ), 10 );
+    ASSERT_EQ( tmp.range_extent( 2 ), 20 );
   }
-  //----------------------------------------
+
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    constexpr int sub_N0 = 1000 ;
-    constexpr int sub_N1 = 200 ;
-    constexpr int sub_N2 = 4 ;
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
-    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+    Kokkos::Experimental::Impl::SubviewExtents< 3, 3 >
       sub( dyn_off3.m_dim
-         , Kokkos::pair<int,int>(0,sub_N0)
-         , Kokkos::pair<int,int>(0,sub_N1)
-         , Kokkos::pair<int,int>(0,sub_N2)
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
          );
 
-    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
 
-    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
-    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
-    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
-    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
 
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
 
-    for ( int k = 0 ; k < sub_N2 ; ++k ){
-    for ( int j = 0 ; j < sub_N1 ; ++j ){
-    for ( int i = 0 ; i < sub_N0 ; ++i ){
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
+    for ( int k = 0; k < sub_N2; ++k )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int i = 0; i < sub_N0; ++i )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
   }
 
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    constexpr int sub_N0 = 1000 ;
-    constexpr int sub_N1 = 200 ;
-    constexpr int sub_N2 = 4 ;
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
-    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+    Kokkos::Experimental::Impl::SubviewExtents< 3, 3 >
       sub( dyn_off3.m_dim
-         , Kokkos::pair<int,int>(0,sub_N0)
-         , Kokkos::pair<int,int>(0,sub_N1)
-         , Kokkos::pair<int,int>(0,sub_N2)
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
          );
 
-    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
 
-    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
-    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
-    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
-    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
 
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
 
-    for ( int i = 0 ; i < sub_N0 ; ++i ){
-    for ( int j = 0 ; j < sub_N1 ; ++j ){
-    for ( int k = 0 ; k < sub_N2 ; ++k ){
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
+    for ( int i = 0; i < sub_N0; ++i )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int k = 0; k < sub_N2; ++k )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
   }
 
   //----------------------------------------
-  // view data analysis
+  // View data analysis.
   {
-    using namespace Kokkos::Experimental::Impl ;
-    static_assert( rank_dynamic<>::value == 0 , "" );
-    static_assert( rank_dynamic<1>::value == 0 , "" );
-    static_assert( rank_dynamic<0>::value == 1 , "" );
-    static_assert( rank_dynamic<0,1>::value == 1 , "" );
-    static_assert( rank_dynamic<0,0,1>::value == 2 , "" );
+    using namespace Kokkos::Experimental::Impl;
+
+    static_assert( rank_dynamic<>::value == 0, "" );
+    static_assert( rank_dynamic< 1 >::value == 0, "" );
+    static_assert( rank_dynamic< 0 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 1 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 0, 1 >::value == 2, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
-
-    typedef ViewArrayAnalysis< int[] >                 a_int_r1 ;
-    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5 ;
-    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1 ;
-    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5 ;
-
-    static_assert( a_int_r1::dimension::rank == 1 , "" );
-    static_assert( a_int_r1::dimension::rank_dynamic == 1 , "" );
-    static_assert( a_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN2 == 4 , "" );
-    static_assert( a_int_r5::dimension::ArgN3 == 5 , "" );
-    static_assert( a_int_r5::dimension::ArgN4 == 6 , "" );
-    static_assert( a_int_r5::dimension::ArgN5 == 1 , "" );
-
-    static_assert( std::is_same< typename a_int_r1::dimension , ViewDimension<0> >::value , "" );
-    static_assert( std::is_same< typename a_int_r1::non_const_value_type , int >::value , "" );
-
-    static_assert( a_const_int_r1::dimension::rank == 1 , "" );
-    static_assert( a_const_int_r1::dimension::rank_dynamic == 1 , "" );
-    static_assert( std::is_same< typename a_const_int_r1::dimension , ViewDimension<0> >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
-
-    static_assert( a_const_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_const_int_r5::dimension::rank_dynamic == 2 , "" );
-
-    static_assert( a_const_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN2 == 4 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN3 == 5 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN4 == 6 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN5 == 1 , "" );
-
-    static_assert( std::is_same< typename a_const_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type , int >::value , "" );
-
-    static_assert( a_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_int_r5::dimension::rank_dynamic == 2 , "" );
-    static_assert( std::is_same< typename a_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
-    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+    using namespace Kokkos::Experimental::Impl;
+
+    typedef ViewArrayAnalysis< int[] >                 a_int_r1;
+    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5;
+    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1;
+    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5;
+
+    static_assert( a_int_r1::dimension::rank == 1, "" );
+    static_assert( a_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r1::dimension::rank == 1, "" );
+    static_assert( a_const_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r5::dimension::rank == 5, "" );
+    static_assert( a_const_int_r5::dimension::rank_dynamic == 2, "" );
+
+    static_assert( a_const_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_const_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_const_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_const_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_const_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type, int >::value, "" );
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 2, "" );
+    static_assert( std::is_same< typename a_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
+    using namespace Kokkos::Experimental::Impl;
 
-    typedef int t_i4[4] ;
+    typedef int t_i4[4];
 
     // Dimensions of t_i4 are appended to the multidimensional array.
-    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5 ;
-
-    static_assert( a_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_int_r5::dimension::rank_dynamic == 3 , "" );
-    static_assert( a_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN2 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN3 == 3 , "" );
-    static_assert( a_int_r5::dimension::ArgN4 == 4 , "" );
-    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5;
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 4, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
+    using namespace Kokkos::Experimental::Impl;
 
-    typedef ViewDataAnalysis< const int[] , void >  a_const_int_r1 ;
+    typedef ViewDataAnalysis< const int[], void >  a_const_int_r1;
 
-    static_assert( std::is_same< typename a_const_int_r1::specialize , void >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::dimension , Kokkos::Experimental::Impl::ViewDimension<0> >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Experimental::Impl::ViewDimension<0> >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r1::type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::value_type , const int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_type , int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_type, int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
 
-    typedef ViewDataAnalysis< const int**[4] , void >  a_const_int_r3 ;
+    typedef ViewDataAnalysis< const int**[4], void >  a_const_int_r3;
 
-    static_assert( std::is_same< typename a_const_int_r3::specialize , void >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r3::dimension , Kokkos::Experimental::Impl::ViewDimension<0,0,4> >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Experimental::Impl::ViewDimension<0, 0, 4> >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r3::type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_type , int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type , int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type , int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_type, int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type, int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type, int**[4] >::value, "" );
 
-
-    // std::cout << "typeid(const int**[4]).name() = " << typeid(const int**[4]).name() << std::endl ;
+    // std::cout << "typeid( const int**[4] ).name() = " << typeid( const int**[4] ).name() << std::endl;
   }
 
   //----------------------------------------
 
   {
-    constexpr int N = 10 ;
+    constexpr int N = 10;
 
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
 
-    int data[N] ;
+    int data[N];
 
-    T vr1(data,N); // view of non-const
-    C cr1(vr1);    // view of const from view of non-const
-    C cr2( (const int *) data , N );
+    T vr1( data, N ); // View of non-const.
+    C cr1( vr1 );     // View of const from view of non-const.
+    C cr2( (const int *) data, N );
 
     // Generate static_assert error:
     // T tmp( cr1 );
 
-    ASSERT_EQ( vr1.span() , N );
-    ASSERT_EQ( cr1.span() , N );
-    ASSERT_EQ( vr1.data() , & data[0] );
-    ASSERT_EQ( cr1.data() , & data[0] );
+    ASSERT_EQ( vr1.span(), N );
+    ASSERT_EQ( cr1.span(), N );
+    ASSERT_EQ( vr1.data(), & data[0] );
+    ASSERT_EQ( cr1.data(), & data[0] );
 
-    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
 
-    ASSERT_EQ( T::Rank , 1 );
+    ASSERT_EQ( T::Rank, 1 );
 
-    ASSERT_TRUE( ( std::is_same< typename C::data_type           , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::data_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type           , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::value_type           , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::value_type          , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::reference_type , const int & >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::reference_type, const int & >::value ) );
 
-    ASSERT_EQ( C::Rank , 1 );
+    ASSERT_EQ( C::Rank, 1 );
 
-    ASSERT_EQ( vr1.dimension_0() , N );
+    ASSERT_EQ( vr1.dimension_0(), N );
 
-    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , typename Space::memory_space >::accessible ) {
-      for ( int i = 0 ; i < N ; ++i ) data[i] = i + 1 ;
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) data[i] = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
 
       {
         T tmp( vr1 );
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
-        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
       }
 
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
     }
   }
 
-
   {
-    constexpr int N = 10 ;
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    constexpr int N = 10;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
+
+    T vr1( "vr1", N );
+    C cr1( vr1 );
 
-    T vr1("vr1",N);
-    C cr1(vr1);
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
+    ASSERT_EQ( T::Rank, 1 );
 
-    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
-    ASSERT_EQ( T::Rank , 1 );
- 
-    ASSERT_EQ( vr1.dimension_0() , N );
+    ASSERT_EQ( vr1.dimension_0(), N );
 
-    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , typename Space::memory_space >::accessible ) {
-      for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 1 ;
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) vr1( i ) = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
 
       {
         T tmp( vr1 );
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
-        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
       }
 
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
     }
   }
 
-  // Testing proper handling of zero-length allocations
+  // Testing proper handling of zero-length allocations.
   {
-    constexpr int N = 0 ;
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    constexpr int N = 0;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
 
-    T vr1("vr1",N);
-    C cr1(vr1);
+    T vr1( "vr1", N );
+    C cr1( vr1 );
 
-    ASSERT_EQ( vr1.dimension_0() , 0 );
-    ASSERT_EQ( cr1.dimension_0() , 0 );
+    ASSERT_EQ( vr1.dimension_0(), 0 );
+    ASSERT_EQ( cr1.dimension_0(), 0 );
   }
 
-
   // Testing using space instance for allocation.
-  // The execution space of the memory space must be available for view data initialization
-
-  if ( std::is_same< ExecSpace , typename ExecSpace::memory_space::execution_space >::value ) {
-
-    using namespace Kokkos::Experimental ;
-
-    typedef typename ExecSpace::memory_space  memory_space ;
-    typedef View<int*,memory_space>           V ;
-
-    constexpr int N = 10 ;
-
-    memory_space mem_space ;
-
-    V v( "v" , N );
-    V va( view_alloc() , N );
-    V vb( view_alloc( "vb" ) , N );
-    V vc( view_alloc( "vc" , AllowPadding ) , N );
-    V vd( view_alloc( "vd" , WithoutInitializing ) , N );
-    V ve( view_alloc( "ve" , WithoutInitializing , AllowPadding ) , N );
-    V vf( view_alloc( "vf" , mem_space , WithoutInitializing , AllowPadding ) , N );
-    V vg( view_alloc( mem_space , "vg" , WithoutInitializing , AllowPadding ) , N );
-    V vh( view_alloc( WithoutInitializing , AllowPadding ) , N );
-    V vi( view_alloc( WithoutInitializing ) , N );
-    V vj( view_alloc( std::string("vj") , AllowPadding ) , N );
-    V vk( view_alloc( mem_space , std::string("vk") , AllowPadding ) , N );
+  // The execution space of the memory space must be available for view data initialization.
+  if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) {
+
+    using namespace Kokkos::Experimental;
+
+    typedef typename ExecSpace::memory_space  memory_space;
+    typedef View< int*, memory_space >        V;
+
+    constexpr int N = 10;
+
+    memory_space mem_space;
+
+    V v( "v", N );
+    V va( view_alloc(), N );
+    V vb( view_alloc( "vb" ), N );
+    V vc( view_alloc( "vc", AllowPadding ), N );
+    V vd( view_alloc( "vd", WithoutInitializing ), N );
+    V ve( view_alloc( "ve", WithoutInitializing, AllowPadding ), N );
+    V vf( view_alloc( "vf", mem_space, WithoutInitializing, AllowPadding ), N );
+    V vg( view_alloc( mem_space, "vg", WithoutInitializing, AllowPadding ), N );
+    V vh( view_alloc( WithoutInitializing, AllowPadding ), N );
+    V vi( view_alloc( WithoutInitializing ), N );
+    V vj( view_alloc( std::string( "vj" ), AllowPadding ), N );
+    V vk( view_alloc( mem_space, std::string( "vk" ), AllowPadding ), N );
   }
 
   {
-    typedef Kokkos::ViewTraits<int***,Kokkos::LayoutStride,ExecSpace>  traits_t ;
-    typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0>                         dims_t ;
-    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t , Kokkos::LayoutStride >  offset_t ;
+    typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace >           traits_t;
+    typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 >                    dims_t;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t, Kokkos::LayoutStride >  offset_t;
 
-    Kokkos::LayoutStride stride ;
+    Kokkos::LayoutStride stride;
 
-    stride.dimension[0] = 3 ;
-    stride.dimension[1] = 4 ;
-    stride.dimension[2] = 5 ;
-    stride.stride[0] = 4 ;
-    stride.stride[1] = 1 ;
-    stride.stride[2] = 12 ;
+    stride.dimension[0] = 3;
+    stride.dimension[1] = 4;
+    stride.dimension[2] = 5;
+    stride.stride[0] = 4;
+    stride.stride[1] = 1;
+    stride.stride[2] = 12;
 
-    const offset_t offset( std::integral_constant<unsigned,0>() , stride );
+    const offset_t offset( std::integral_constant< unsigned, 0 >(), stride );
 
-    ASSERT_EQ( offset.dimension_0() , 3 );
-    ASSERT_EQ( offset.dimension_1() , 4 );
-    ASSERT_EQ( offset.dimension_2() , 5 );
+    ASSERT_EQ( offset.dimension_0(), 3 );
+    ASSERT_EQ( offset.dimension_1(), 4 );
+    ASSERT_EQ( offset.dimension_2(), 5 );
 
-    ASSERT_EQ( offset.stride_0() , 4 );
-    ASSERT_EQ( offset.stride_1() , 1 );
-    ASSERT_EQ( offset.stride_2() , 12 );
+    ASSERT_EQ( offset.stride_0(), 4 );
+    ASSERT_EQ( offset.stride_1(), 1 );
+    ASSERT_EQ( offset.stride_2(), 12 );
 
-    ASSERT_EQ( offset.span() , 60 );
+    ASSERT_EQ( offset.span(), 60 );
     ASSERT_TRUE( offset.span_is_contiguous() );
 
-    Kokkos::Experimental::Impl::ViewMapping< traits_t , void >
-      v( Kokkos::Experimental::Impl::ViewCtorProp<int*>((int*)0), stride );
+    Kokkos::Experimental::Impl::ViewMapping< traits_t, void >
+      v( Kokkos::Experimental::Impl::ViewCtorProp< int* >( (int*) 0 ), stride );
   }
 
   {
-    typedef Kokkos::View<int**,Space>  V ;
-    typedef typename V::HostMirror  M ;
-    typedef typename Kokkos::View<int**,Space>::array_layout layout_type;
+    typedef Kokkos::View< int**, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Space >::array_layout layout_type;
 
-    constexpr int N0 = 10 ;
-    constexpr int N1 = 11 ;
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
 
-    V a("a",N0,N1);
-    M b = Kokkos::Experimental::create_mirror(a);
-    M c = Kokkos::Experimental::create_mirror_view(a);
-    M d ;
+    V a( "a", N0, N1 );
+    M b = Kokkos::Experimental::create_mirror( a );
+    M c = Kokkos::Experimental::create_mirror_view( a );
+    M d;
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    Kokkos::Experimental::deep_copy( a , b );
-    Kokkos::Experimental::deep_copy( c , a );
+    Kokkos::Experimental::deep_copy( a, b );
+    Kokkos::Experimental::deep_copy( c, a );
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
 
-    Kokkos::Experimental::resize( b , 5 , 6 );
+    Kokkos::Experimental::resize( b, 5, 6 );
 
-    for ( int i0 = 0 ; i0 < 5 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 6 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 5; ++i0 )
+    for ( int i1 = 0; i1 < 6; ++i1 )
+    {
       int val = 1 + i0 + i1 * N0;
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
-      ASSERT_EQ( b(i0,i1) , val );
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+      ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , 5 , 6 );
-    Kokkos::Experimental::realloc( d , 5 , 6 );
+    Kokkos::Experimental::realloc( c, 5, 6 );
+    Kokkos::Experimental::realloc( d, 5, 6 );
 
-    ASSERT_EQ( b.dimension_0() , 5 );
-    ASSERT_EQ( b.dimension_1() , 6 );
-    ASSERT_EQ( c.dimension_0() , 5 );
-    ASSERT_EQ( c.dimension_1() , 6 );
-    ASSERT_EQ( d.dimension_0() , 5 );
-    ASSERT_EQ( d.dimension_1() , 6 );
+    ASSERT_EQ( b.dimension_0(), 5 );
+    ASSERT_EQ( b.dimension_1(), 6 );
+    ASSERT_EQ( c.dimension_0(), 5 );
+    ASSERT_EQ( c.dimension_1(), 6 );
+    ASSERT_EQ( d.dimension_0(), 5 );
+    ASSERT_EQ( d.dimension_1(), 6 );
 
-    layout_type layout(7,8);
-    Kokkos::Experimental::resize( b , layout );
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 6 ; i1 < 8 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    layout_type layout( 7, 8 );
+    Kokkos::Experimental::resize( b, layout );
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 6; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    for ( int i0 = 5 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 5; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
        int val = 1 + i0 + i1 * N0;
-       ASSERT_EQ( b(i0,i1) , val );
+       ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , layout );
-    Kokkos::Experimental::realloc( d , layout );
-
-    ASSERT_EQ( b.dimension_0() , 7 );
-    ASSERT_EQ( b.dimension_1() , 8 );
-    ASSERT_EQ( c.dimension_0() , 7 );
-    ASSERT_EQ( c.dimension_1() , 8 );
-    ASSERT_EQ( d.dimension_0() , 7 );
-    ASSERT_EQ( d.dimension_1() , 8 );
+    Kokkos::Experimental::realloc( c, layout );
+    Kokkos::Experimental::realloc( d, layout );
 
+    ASSERT_EQ( b.dimension_0(), 7 );
+    ASSERT_EQ( b.dimension_1(), 8 );
+    ASSERT_EQ( c.dimension_0(), 7 );
+    ASSERT_EQ( c.dimension_1(), 8 );
+    ASSERT_EQ( d.dimension_0(), 7 );
+    ASSERT_EQ( d.dimension_1(), 8 );
   }
 
   {
-    typedef Kokkos::View<int**,Kokkos::LayoutStride,Space>  V ;
-    typedef typename V::HostMirror  M ;
-    typedef typename Kokkos::View<int**,Kokkos::LayoutStride,Space>::array_layout layout_type;
+    typedef Kokkos::View< int**, Kokkos::LayoutStride, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Kokkos::LayoutStride, Space >::array_layout layout_type;
 
-    constexpr int N0 = 10 ;
-    constexpr int N1 = 11 ;
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
 
-    const int dimensions[] = {N0,N1};
-    const int order[] = {1,0};
+    const int dimensions[] = { N0, N1 };
+    const int order[] = { 1, 0 };
 
-    V a("a",Kokkos::LayoutStride::order_dimensions(2,order,dimensions));
-    M b = Kokkos::Experimental::create_mirror(a);
-    M c = Kokkos::Experimental::create_mirror_view(a);
-    M d ;
+    V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) );
+    M b = Kokkos::Experimental::create_mirror( a );
+    M c = Kokkos::Experimental::create_mirror_view( a );
+    M d;
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    Kokkos::Experimental::deep_copy( a , b );
-    Kokkos::Experimental::deep_copy( c , a );
+    Kokkos::Experimental::deep_copy( a, b );
+    Kokkos::Experimental::deep_copy( c, a );
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
 
-    const int dimensions2[] = {7,8};
-    const int order2[] = {1,0};
-    layout_type layout = layout_type::order_dimensions(2,order2,dimensions2);
-    Kokkos::Experimental::resize( b , layout );
+    const int dimensions2[] = { 7, 8 };
+    const int order2[] = { 1, 0 };
+    layout_type layout = layout_type::order_dimensions( 2, order2, dimensions2 );
+    Kokkos::Experimental::resize( b, layout );
 
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
        int val = 1 + i0 + i1 * N0;
-       ASSERT_EQ( b(i0,i1) , c(i0,i1) );
-       ASSERT_EQ( b(i0,i1) , val );
+       ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+       ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , layout );
-    Kokkos::Experimental::realloc( d , layout );
+    Kokkos::Experimental::realloc( c, layout );
+    Kokkos::Experimental::realloc( d, layout );
 
-    ASSERT_EQ( b.dimension_0() , 7 );
-    ASSERT_EQ( b.dimension_1() , 8 );
-    ASSERT_EQ( c.dimension_0() , 7 );
-    ASSERT_EQ( c.dimension_1() , 8 );
-    ASSERT_EQ( d.dimension_0() , 7 );
-    ASSERT_EQ( d.dimension_1() , 8 );
+    ASSERT_EQ( b.dimension_0(), 7 );
+    ASSERT_EQ( b.dimension_1(), 8 );
+    ASSERT_EQ( c.dimension_0(), 7 );
+    ASSERT_EQ( c.dimension_1(), 8 );
+    ASSERT_EQ( d.dimension_0(), 7 );
+    ASSERT_EQ( d.dimension_1(), 8 );
 
   }
 
   {
-    typedef Kokkos::View<int*,Space> V ;
-    typedef Kokkos::View<int*,Space,Kokkos::MemoryUnmanaged> U ;
+    typedef Kokkos::View< int*, Space > V;
+    typedef Kokkos::View< int*, Space, Kokkos::MemoryUnmanaged > U;
 
+    V a( "a", 10 );
 
-    V a("a",10);
+    ASSERT_EQ( a.use_count(), 1 );
 
-    ASSERT_EQ( a.use_count() , 1 );
+    V b = a;
 
-    V b = a ;
-
-    ASSERT_EQ( a.use_count() , 2 );
-    ASSERT_EQ( b.use_count() , 2 );
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
 
     {
-      U c = b ; // 'c' is compile-time unmanaged
+      U c = b; // 'c' is compile-time unmanaged.
 
-      ASSERT_EQ( a.use_count() , 2 );
-      ASSERT_EQ( b.use_count() , 2 );
-      ASSERT_EQ( c.use_count() , 2 );
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
 
-      V d = c ; // 'd' is run-time unmanaged
+      V d = c; // 'd' is run-time unmanaged.
 
-      ASSERT_EQ( a.use_count() , 2 );
-      ASSERT_EQ( b.use_count() , 2 );
-      ASSERT_EQ( c.use_count() , 2 );
-      ASSERT_EQ( d.use_count() , 2 );
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
+      ASSERT_EQ( d.use_count(), 2 );
     }
 
-    ASSERT_EQ( a.use_count() , 2 );
-    ASSERT_EQ( b.use_count() , 2 );
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
 
     b = V();
 
-    ASSERT_EQ( a.use_count() , 1 );
-    ASSERT_EQ( b.use_count() , 0 );
-
-#if ! defined ( KOKKOS_ENABLE_CUDA_LAMBDA )
-    /* Cannot launch host lambda when CUDA lambda is enabled */
-
-    typedef typename Kokkos::Impl::HostMirror< Space >::Space::execution_space
-      host_exec_space ;
-
-    Kokkos::parallel_for(
-      Kokkos::RangePolicy< host_exec_space >(0,10) ,
-      KOKKOS_LAMBDA( int i ){
-        // 'a' is captured by copy and the capture mechanism
-        // converts 'a' to an unmanaged copy.
-        // When the parallel dispatch accepts a move for the lambda
-        // this count should become 1
-        ASSERT_EQ( a.use_count() , 2 );
-        V x = a ;
-        ASSERT_EQ( a.use_count() , 2 );
-        ASSERT_EQ( x.use_count() , 2 );
-      });
-#endif /* #if ! defined ( KOKKOS_ENABLE_CUDA_LAMBDA ) */
+    ASSERT_EQ( a.use_count(), 1 );
+    ASSERT_EQ( b.use_count(), 0 );
+
+#if !defined( KOKKOS_ENABLE_CUDA_LAMBDA )
+    // Cannot launch host lambda when CUDA lambda is enabled.
+
+    typedef typename Kokkos::Impl::HostMirror< Space >::Space::execution_space host_exec_space;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< host_exec_space >( 0, 10 ), KOKKOS_LAMBDA ( int i ) {
+      // 'a' is captured by copy, and the capture mechanism converts 'a' to an
+      // unmanaged copy.  When the parallel dispatch accepts a move for the
+      // lambda, this count should become 1.
+      ASSERT_EQ( a.use_count(), 2 );
+      V x = a;
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( x.use_count(), 2 );
+    });
+#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA )
   }
 }
 
 template< class Space >
 struct TestViewMappingSubview
 {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
-  typedef Kokkos::pair<int,int> range ;
+  typedef Kokkos::pair< int, int > range;
 
   enum { AN = 10 };
-  typedef Kokkos::View<int*,ExecSpace>  AT ;
-  typedef Kokkos::View<const int*,ExecSpace>  ACT ;
-  typedef Kokkos::Subview< AT , range >  AS ;
+  typedef Kokkos::View< int*, ExecSpace >  AT;
+  typedef Kokkos::View< const int*, ExecSpace >  ACT;
+  typedef Kokkos::Subview< AT, range >  AS;
 
-  enum { BN0 = 10 , BN1 = 11 , BN2 = 12 };
-  typedef Kokkos::View<int***,ExecSpace>  BT ;
-  typedef Kokkos::Subview< BT , range , range , range >  BS ;
+  enum { BN0 = 10, BN1 = 11, BN2 = 12 };
+  typedef Kokkos::View< int***, ExecSpace >  BT;
+  typedef Kokkos::Subview< BT, range, range, range >  BS;
 
-  enum { CN0 = 10 , CN1 = 11 , CN2 = 12 };
-  typedef Kokkos::View<int***[13][14],ExecSpace>  CT ;
-  typedef Kokkos::Subview< CT , range , range , range , int , int >  CS ;
+  enum { CN0 = 10, CN1 = 11, CN2 = 12 };
+  typedef Kokkos::View< int***[13][14], ExecSpace >  CT;
+  typedef Kokkos::Subview< CT, range, range, range, int, int >  CS;
 
-  enum { DN0 = 10 , DN1 = 11 , DN2 = 12 , DN3 = 13 , DN4 = 14 };
-  typedef Kokkos::View<int***[DN3][DN4],ExecSpace>  DT ;
-  typedef Kokkos::Subview< DT , int , range , range , range , int >  DS ;
+  enum { DN0 = 10, DN1 = 11, DN2 = 12, DN3 = 13, DN4 = 14 };
+  typedef Kokkos::View< int***[DN3][DN4], ExecSpace >  DT;
+  typedef Kokkos::Subview< DT, int, range, range, range, int >  DS;
 
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutLeft, ExecSpace >  DLT;
+  typedef Kokkos::Subview< DLT, range, int, int, int, int >  DLS1;
 
-  typedef Kokkos::View<int***[13][14],Kokkos::LayoutLeft,ExecSpace>  DLT ;
-  typedef Kokkos::Subview< DLT , range , int , int , int , int >  DLS1 ;
-
-  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout , Kokkos::LayoutLeft >::value
+  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout, Kokkos::LayoutLeft >::value
                , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" );
 
-  typedef Kokkos::View<int***[13][14],Kokkos::LayoutRight,ExecSpace>  DRT ;
-  typedef Kokkos::Subview< DRT , int , int , int , int , range >  DRS1 ;
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutRight, ExecSpace >  DRT;
+  typedef Kokkos::Subview< DRT, int, int, int, int, range >  DRS1;
 
-  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout , Kokkos::LayoutRight >::value
+  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout, Kokkos::LayoutRight >::value
                , "Subview layout error for rank 1 subview of right-most range of LayoutRight" );
 
-  AT Aa ;
-  AS Ab ;
-  ACT Ac ;
-  BT Ba ;
-  BS Bb ;
-  CT Ca ;
-  CS Cb ;
-  DT Da ;
-  DS Db ;
+  AT Aa;
+  AS Ab;
+  ACT Ac;
+  BT Ba;
+  BS Bb;
+  CT Ca;
+  CS Cb;
+  DT Da;
+  DS Db;
 
   TestViewMappingSubview()
-    : Aa("Aa",AN)
-    , Ab( Kokkos::Experimental::subview( Aa , std::pair<int,int>(1,AN-1) ) )
-    , Ac( Aa , std::pair<int,int>(1,AN-1) )
-    , Ba("Ba",BN0,BN1,BN2)
+    : Aa( "Aa", AN )
+    , Ab( Kokkos::Experimental::subview( Aa, std::pair< int, int >( 1, AN - 1 ) ) )
+    , Ac( Aa, std::pair< int, int >( 1, AN - 1 ) )
+    , Ba( "Ba", BN0, BN1, BN2 )
     , Bb( Kokkos::Experimental::subview( Ba
-                                        , std::pair<int,int>(1,BN0-1)
-                                        , std::pair<int,int>(1,BN1-1)
-                                        , std::pair<int,int>(1,BN2-1)
+                                        , std::pair< int, int >( 1, BN0 - 1 )
+                                        , std::pair< int, int >( 1, BN1 - 1 )
+                                        , std::pair< int, int >( 1, BN2 - 1 )
                                         ) )
-    , Ca("Ca",CN0,CN1,CN2)
+    , Ca( "Ca", CN0, CN1, CN2 )
     , Cb( Kokkos::Experimental::subview( Ca
-                                        , std::pair<int,int>(1,CN0-1)
-                                        , std::pair<int,int>(1,CN1-1)
-                                        , std::pair<int,int>(1,CN2-1)
+                                        , std::pair< int, int >( 1, CN0 - 1 )
+                                        , std::pair< int, int >( 1, CN1 - 1 )
+                                        , std::pair< int, int >( 1, CN2 - 1 )
                                         , 1
                                         , 2
                                         ) )
-    , Da("Da",DN0,DN1,DN2)
+    , Da( "Da", DN0, DN1, DN2 )
     , Db( Kokkos::Experimental::subview( Da
                                         , 1
-                                        , std::pair<int,int>(1,DN1-1)
-                                        , std::pair<int,int>(1,DN2-1)
-                                        , std::pair<int,int>(1,DN3-1)
+                                        , std::pair< int, int >( 1, DN1 - 1 )
+                                        , std::pair< int, int >( 1, DN2 - 1 )
+                                        , std::pair< int, int >( 1, DN3 - 1 )
                                         , 2
                                         ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int, long & error_count ) const
+  {
+    auto Ad = Kokkos::Experimental::subview< Kokkos::MemoryUnmanaged >( Aa, Kokkos::pair< int, int >( 1, AN - 1 ) );
+
+    for ( int i = 1; i < AN - 1; ++i ) if ( & Aa[i] != & Ab[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if ( & Aa[i] != & Ac[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if ( & Aa[i] != & Ad[i - 1] ) ++error_count;
+
+    for ( int i2 = 1; i2 < BN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < BN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < BN0 - 1; ++i0 )
     {
+      if ( & Ba( i0, i1, i2 ) != & Bb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
     }
 
+    for ( int i2 = 1; i2 < CN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < CN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < CN0 - 1; ++i0 )
+    {
+      if ( & Ca( i0, i1, i2, 1, 2 ) != & Cb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
+    }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int , long & error_count ) const
+    for ( int i2 = 1; i2 < DN3 - 1; ++i2 )
+    for ( int i1 = 1; i1 < DN2 - 1; ++i1 )
+    for ( int i0 = 1; i0 < DN1 - 1; ++i0 )
     {
-      auto Ad = Kokkos::Experimental::subview< Kokkos::MemoryUnmanaged >( Aa , Kokkos::pair<int,int>(1,AN-1) );
-
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ab[i-1] ) ++error_count ;
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ac[i-1] ) ++error_count ;
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ad[i-1] ) ++error_count ;
-
-      for ( int i2 = 1 ; i2 < BN2-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < BN1-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < BN0-1 ; ++i0 ) {
-        if ( & Ba(i0,i1,i2) != & Bb(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
-
-      for ( int i2 = 1 ; i2 < CN2-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < CN1-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < CN0-1 ; ++i0 ) {
-        if ( & Ca(i0,i1,i2,1,2) != & Cb(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
-
-      for ( int i2 = 1 ; i2 < DN3-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < DN2-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < DN1-1 ; ++i0 ) {
-        if ( & Da(1,i0,i1,i2,2) != & Db(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
+      if ( & Da( 1, i0, i1, i2, 2 ) != & Db( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
     }
+  }
 
   static void run()
   {
-    TestViewMappingSubview self ;
-
-    ASSERT_EQ( self.Aa.dimension_0() , AN );
-    ASSERT_EQ( self.Ab.dimension_0() , AN - 2 );
-    ASSERT_EQ( self.Ac.dimension_0() , AN - 2 );
-    ASSERT_EQ( self.Ba.dimension_0() , BN0 );
-    ASSERT_EQ( self.Ba.dimension_1() , BN1 );
-    ASSERT_EQ( self.Ba.dimension_2() , BN2 );
-    ASSERT_EQ( self.Bb.dimension_0() , BN0 - 2 );
-    ASSERT_EQ( self.Bb.dimension_1() , BN1 - 2 );
-    ASSERT_EQ( self.Bb.dimension_2() , BN2 - 2 );
-
-    ASSERT_EQ( self.Ca.dimension_0() , CN0 );
-    ASSERT_EQ( self.Ca.dimension_1() , CN1 );
-    ASSERT_EQ( self.Ca.dimension_2() , CN2 );
-    ASSERT_EQ( self.Ca.dimension_3() , 13 );
-    ASSERT_EQ( self.Ca.dimension_4() , 14 );
-    ASSERT_EQ( self.Cb.dimension_0() , CN0 - 2 );
-    ASSERT_EQ( self.Cb.dimension_1() , CN1 - 2 );
-    ASSERT_EQ( self.Cb.dimension_2() , CN2 - 2 );
-
-    ASSERT_EQ( self.Da.dimension_0() , DN0 );
-    ASSERT_EQ( self.Da.dimension_1() , DN1 );
-    ASSERT_EQ( self.Da.dimension_2() , DN2 );
-    ASSERT_EQ( self.Da.dimension_3() , DN3 );
-    ASSERT_EQ( self.Da.dimension_4() , DN4 );
-
-    ASSERT_EQ( self.Db.dimension_0() , DN1 - 2 );
-    ASSERT_EQ( self.Db.dimension_1() , DN2 - 2 );
-    ASSERT_EQ( self.Db.dimension_2() , DN3 - 2 );
-
-    ASSERT_EQ( self.Da.stride_1() , self.Db.stride_0() );
-    ASSERT_EQ( self.Da.stride_2() , self.Db.stride_1() );
-    ASSERT_EQ( self.Da.stride_3() , self.Db.stride_2() );
-
-    long error_count = -1 ;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >(0,1) , self , error_count );
-    ASSERT_EQ( error_count , 0 );
+    TestViewMappingSubview self;
+
+    ASSERT_EQ( self.Aa.dimension_0(), AN );
+    ASSERT_EQ( self.Ab.dimension_0(), AN - 2 );
+    ASSERT_EQ( self.Ac.dimension_0(), AN - 2 );
+    ASSERT_EQ( self.Ba.dimension_0(), BN0 );
+    ASSERT_EQ( self.Ba.dimension_1(), BN1 );
+    ASSERT_EQ( self.Ba.dimension_2(), BN2 );
+    ASSERT_EQ( self.Bb.dimension_0(), BN0 - 2 );
+    ASSERT_EQ( self.Bb.dimension_1(), BN1 - 2 );
+    ASSERT_EQ( self.Bb.dimension_2(), BN2 - 2 );
+
+    ASSERT_EQ( self.Ca.dimension_0(), CN0 );
+    ASSERT_EQ( self.Ca.dimension_1(), CN1 );
+    ASSERT_EQ( self.Ca.dimension_2(), CN2 );
+    ASSERT_EQ( self.Ca.dimension_3(), 13 );
+    ASSERT_EQ( self.Ca.dimension_4(), 14 );
+    ASSERT_EQ( self.Cb.dimension_0(), CN0 - 2 );
+    ASSERT_EQ( self.Cb.dimension_1(), CN1 - 2 );
+    ASSERT_EQ( self.Cb.dimension_2(), CN2 - 2 );
+
+    ASSERT_EQ( self.Da.dimension_0(), DN0 );
+    ASSERT_EQ( self.Da.dimension_1(), DN1 );
+    ASSERT_EQ( self.Da.dimension_2(), DN2 );
+    ASSERT_EQ( self.Da.dimension_3(), DN3 );
+    ASSERT_EQ( self.Da.dimension_4(), DN4 );
+
+    ASSERT_EQ( self.Db.dimension_0(), DN1 - 2 );
+    ASSERT_EQ( self.Db.dimension_1(), DN2 - 2 );
+    ASSERT_EQ( self.Db.dimension_2(), DN3 - 2 );
+
+    ASSERT_EQ( self.Da.stride_1(), self.Db.stride_0() );
+    ASSERT_EQ( self.Da.stride_2(), self.Db.stride_1() );
+    ASSERT_EQ( self.Da.stride_3(), self.Db.stride_2() );
+
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, 1 ), self, error_count );
+    ASSERT_EQ( error_count, 0 );
   }
-
 };
 
 template< class Space >
 void test_view_mapping_subview()
 {
-  typedef typename Space::execution_space ExecSpace ;
+  typedef typename Space::execution_space ExecSpace;
 
   TestViewMappingSubview< ExecSpace >::run();
 }
@@ -1181,214 +1195,228 @@ struct TestViewMapOperator {
   static_assert( ViewType::reference_type_is_lvalue_reference
                , "Test only valid for lvalue reference type" );
 
-  const ViewType v ;
+  const ViewType v;
 
   KOKKOS_INLINE_FUNCTION
-  void test_left( size_t i0 , long & error_count ) const
+  void test_left( size_t i0, long & error_count ) const
+  {
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+    const size_t n1 = v.dimension_1();
+    const size_t n2 = v.dimension_2();
+    const size_t n3 = v.dimension_3();
+    const size_t n4 = v.dimension_4();
+    const size_t n5 = v.dimension_5();
+    const size_t n6 = v.dimension_6();
+    const size_t n7 = v.dimension_7();
+
+    long offset = 0;
+
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
     {
-      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
-      const size_t n1 = v.dimension_1();
-      const size_t n2 = v.dimension_2();
-      const size_t n3 = v.dimension_3();
-      const size_t n4 = v.dimension_4();
-      const size_t n5 = v.dimension_5();
-      const size_t n6 = v.dimension_6();
-      const size_t n7 = v.dimension_7();
-
-      long offset = 0 ;
-
-      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
-      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
-      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
-      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
-      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
-      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
-      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
-      {
-        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
-        if ( d < offset ) ++error_count ;
-        offset = d ;
-      }
-
-      if ( v.span() <= size_t(offset) ) ++error_count ;
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+      if ( d < offset ) ++error_count;
+      offset = d;
     }
 
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
   KOKKOS_INLINE_FUNCTION
-  void test_right( size_t i0 , long & error_count ) const
+  void test_right( size_t i0, long & error_count ) const
+  {
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+    const size_t n1 = v.dimension_1();
+    const size_t n2 = v.dimension_2();
+    const size_t n3 = v.dimension_3();
+    const size_t n4 = v.dimension_4();
+    const size_t n5 = v.dimension_5();
+    const size_t n6 = v.dimension_6();
+    const size_t n7 = v.dimension_7();
+
+    long offset = 0;
+
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
     {
-      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
-      const size_t n1 = v.dimension_1();
-      const size_t n2 = v.dimension_2();
-      const size_t n3 = v.dimension_3();
-      const size_t n4 = v.dimension_4();
-      const size_t n5 = v.dimension_5();
-      const size_t n6 = v.dimension_6();
-      const size_t n7 = v.dimension_7();
-
-      long offset = 0 ;
-
-      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
-      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
-      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
-      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
-      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
-      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
-      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
-      {
-        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
-        if ( d < offset ) ++error_count ;
-        offset = d ;
-      }
-
-      if ( v.span() <= size_t(offset) ) ++error_count ;
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+      if ( d < offset ) ++error_count;
+      offset = d;
     }
 
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t i , long & error_count ) const
-    {
-      if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutLeft >::value )
-        test_left(i,error_count);
-      else if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutRight >::value )
-        test_right(i,error_count);
+  void operator()( size_t i, long & error_count ) const
+  {
+    if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutLeft >::value ) {
+      test_left( i, error_count );
     }
+    else if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutRight >::value ) {
+      test_right( i, error_count );
+    }
+  }
 
-  constexpr static size_t N0 = 10 ;
-  constexpr static size_t N1 =  9 ;
-  constexpr static size_t N2 =  8 ;
-  constexpr static size_t N3 =  7 ;
-  constexpr static size_t N4 =  6 ;
-  constexpr static size_t N5 =  5 ;
-  constexpr static size_t N6 =  4 ;
-  constexpr static size_t N7 =  3 ;
+  constexpr static size_t N0 = 10;
+  constexpr static size_t N1 =  9;
+  constexpr static size_t N2 =  8;
+  constexpr static size_t N3 =  7;
+  constexpr static size_t N4 =  6;
+  constexpr static size_t N5 =  5;
+  constexpr static size_t N6 =  4;
+  constexpr static size_t N7 =  3;
 
-  TestViewMapOperator() : v( "Test" , N0, N1, N2, N3, N4, N5, N6, N7 ) {}
+  TestViewMapOperator() : v( "Test", N0, N1, N2, N3, N4, N5, N6, N7 ) {}
 
   static void run()
-    {
-      TestViewMapOperator self ;
-
-      ASSERT_EQ( self.v.dimension_0() , ( 0 < ViewType::rank ? N0 : 1 ) );
-      ASSERT_EQ( self.v.dimension_1() , ( 1 < ViewType::rank ? N1 : 1 ) );
-      ASSERT_EQ( self.v.dimension_2() , ( 2 < ViewType::rank ? N2 : 1 ) );
-      ASSERT_EQ( self.v.dimension_3() , ( 3 < ViewType::rank ? N3 : 1 ) );
-      ASSERT_EQ( self.v.dimension_4() , ( 4 < ViewType::rank ? N4 : 1 ) );
-      ASSERT_EQ( self.v.dimension_5() , ( 5 < ViewType::rank ? N5 : 1 ) );
-      ASSERT_EQ( self.v.dimension_6() , ( 6 < ViewType::rank ? N6 : 1 ) );
-      ASSERT_EQ( self.v.dimension_7() , ( 7 < ViewType::rank ? N7 : 1 ) );
-
-      ASSERT_LE( self.v.dimension_0()*
-                 self.v.dimension_1()*
-                 self.v.dimension_2()*
-                 self.v.dimension_3()*
-                 self.v.dimension_4()*
-                 self.v.dimension_5()*
-                 self.v.dimension_6()*
-                 self.v.dimension_7()
-               , self.v.span() );
-
-      long error_count ;
-      Kokkos::RangePolicy< typename ViewType::execution_space > range(0,self.v.dimension_0());
-      Kokkos::parallel_reduce( range , self , error_count );
-      ASSERT_EQ( 0 , error_count );
-    }
+  {
+    TestViewMapOperator self;
+
+    ASSERT_EQ( self.v.dimension_0(), ( 0 < ViewType::rank ? N0 : 1 ) );
+    ASSERT_EQ( self.v.dimension_1(), ( 1 < ViewType::rank ? N1 : 1 ) );
+    ASSERT_EQ( self.v.dimension_2(), ( 2 < ViewType::rank ? N2 : 1 ) );
+    ASSERT_EQ( self.v.dimension_3(), ( 3 < ViewType::rank ? N3 : 1 ) );
+    ASSERT_EQ( self.v.dimension_4(), ( 4 < ViewType::rank ? N4 : 1 ) );
+    ASSERT_EQ( self.v.dimension_5(), ( 5 < ViewType::rank ? N5 : 1 ) );
+    ASSERT_EQ( self.v.dimension_6(), ( 6 < ViewType::rank ? N6 : 1 ) );
+    ASSERT_EQ( self.v.dimension_7(), ( 7 < ViewType::rank ? N7 : 1 ) );
+
+    ASSERT_LE( self.v.dimension_0() *
+               self.v.dimension_1() *
+               self.v.dimension_2() *
+               self.v.dimension_3() *
+               self.v.dimension_4() *
+               self.v.dimension_5() *
+               self.v.dimension_6() *
+               self.v.dimension_7()
+             , self.v.span() );
+
+    long error_count;
+    Kokkos::RangePolicy< typename ViewType::execution_space > range( 0, self.v.dimension_0() );
+    Kokkos::parallel_reduce( range, self, error_count );
+    ASSERT_EQ( 0, error_count );
+  }
 };
 
-
 template< class Space >
 void test_view_mapping_operator()
 {
-  typedef typename Space::execution_space ExecSpace ;
-
-  TestViewMapOperator< Kokkos::View<int,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int**,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int***,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int****,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*****,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int******,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*******,Kokkos::LayoutLeft,ExecSpace> >::run();
-
-  TestViewMapOperator< Kokkos::View<int,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int**,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int***,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int****,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*****,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int******,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*******,Kokkos::LayoutRight,ExecSpace> >::run();
+  typedef typename Space::execution_space ExecSpace;
+
+  TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutLeft, ExecSpace> >::run();
+
+  TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutRight, ExecSpace> >::run();
 }
 
 /*--------------------------------------------------------------------------*/
 
 template< class Space >
 struct TestViewMappingAtomic {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
-  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait ;
+  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait;
 
-  typedef Kokkos::View< int * , ExecSpace > T ;
-  typedef Kokkos::View< int * , ExecSpace , mem_trait >  T_atom ;
+  typedef Kokkos::View< int *, ExecSpace > T;
+  typedef Kokkos::View< int *, ExecSpace, mem_trait >  T_atom;
 
-  T      x ;
-  T_atom x_atom ;
+  T      x;
+  T_atom x_atom;
 
-  constexpr static size_t N = 100000 ;
+  constexpr static size_t N = 100000;
 
   struct TagInit {};
   struct TagUpdate {};
   struct TagVerify {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const
-    { x(i) = i ; }
+  void operator()( const TagInit &, const int i ) const
+  { x( i ) = i; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagUpdate & , const int i ) const
-    { x_atom(i%2) += 1 ; }
+  void operator()( const TagUpdate &, const int i ) const
+  { x_atom( i % 2 ) += 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagVerify & , const int i , long & error_count ) const
-    {
-       if ( i < 2 ) { if ( x(i) != int(i + N / 2) ) ++error_count ; }
-       else         { if ( x(i) != int(i) ) ++error_count ; }
-    }
+  void operator()( const TagVerify &, const int i, long & error_count ) const
+  {
+    if ( i < 2 ) { if ( x( i ) != int( i + N / 2 ) ) ++error_count; }
+    else         { if ( x( i ) != int( i ) ) ++error_count; }
+  }
 
   TestViewMappingAtomic()
-    : x("x",N)
+    : x( "x", N )
     , x_atom( x )
     {}
 
   static void run()
+  {
+    ASSERT_TRUE( T::reference_type_is_lvalue_reference );
+    ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
+
+    TestViewMappingAtomic self;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagInit >( 0, N ), self );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagUpdate >( 0, N ), self );
+
+    long error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagVerify >( 0, N ), self, error_count );
+
+    ASSERT_EQ( 0, error_count );
+
+    typename TestViewMappingAtomic::T_atom::HostMirror x_host = Kokkos::create_mirror_view( self.x );
+    Kokkos::deep_copy( x_host, self.x );
+
+    error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::DefaultHostExecutionSpace, TagVerify >( 0, N ),
+      [=] ( const TagVerify &, const int i, long & tmp_error_count )
     {
-      ASSERT_TRUE( T::reference_type_is_lvalue_reference );
-      ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
-
-      TestViewMappingAtomic self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagInit >(0,N) , self );
-      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagUpdate >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagVerify >(0,N) , self , error_count );
-      ASSERT_EQ( 0 , error_count );
-      typename TestViewMappingAtomic::T_atom::HostMirror x_host = Kokkos::create_mirror_view(self.x);
-      Kokkos::deep_copy(x_host,self.x);
-      error_count = -1;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::DefaultHostExecutionSpace, TagVerify>(0,N), 
-        [=] ( const TagVerify & , const int i , long & tmp_error_count ) {
-        if ( i < 2 ) { if ( x_host(i) != int(i + N / 2) ) ++tmp_error_count ; }
-        else         { if ( x_host(i) != int(i) ) ++tmp_error_count ; }
-      }, error_count);
-      ASSERT_EQ( 0 , error_count );
-      Kokkos::deep_copy(self.x,x_host);
-    }
+      if ( i < 2 ) {
+        if ( x_host( i ) != int( i + N / 2 ) ) ++tmp_error_count;
+      }
+      else {
+        if ( x_host( i ) != int( i ) ) ++tmp_error_count;
+      }
+    }, error_count);
+
+    ASSERT_EQ( 0, error_count );
+    Kokkos::deep_copy( self.x, x_host );
+  }
 };
 
 /*--------------------------------------------------------------------------*/
 
 template< class Space >
 struct TestViewMappingClassValue {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
   struct ValueType {
     KOKKOS_INLINE_FUNCTION
@@ -1396,11 +1424,11 @@ struct TestViewMappingClassValue {
     {
 #if 0
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-      printf("TestViewMappingClassValue construct on Cuda\n");
+      printf( "TestViewMappingClassValue construct on Cuda\n" );
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      printf("TestViewMappingClassValue construct on Host\n");
+      printf( "TestViewMappingClassValue construct on Host\n" );
 #else
-      printf("TestViewMappingClassValue construct unknown\n");
+      printf( "TestViewMappingClassValue construct unknown\n" );
 #endif
 #endif
     }
@@ -1409,11 +1437,11 @@ struct TestViewMappingClassValue {
     {
 #if 0
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-      printf("TestViewMappingClassValue destruct on Cuda\n");
+      printf( "TestViewMappingClassValue destruct on Cuda\n" );
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      printf("TestViewMappingClassValue destruct on Host\n");
+      printf( "TestViewMappingClassValue destruct on Host\n" );
 #else
-      printf("TestViewMappingClassValue destruct unknown\n");
+      printf( "TestViewMappingClassValue destruct unknown\n" );
 #endif
 #endif
     }
@@ -1421,17 +1449,15 @@ struct TestViewMappingClassValue {
 
   static void run()
   {
-    using namespace Kokkos::Experimental ;
+    using namespace Kokkos::Experimental;
+
     ExecSpace::fence();
     {
-      View< ValueType , ExecSpace > a("a");
+      View< ValueType, ExecSpace > a( "a" );
       ExecSpace::fence();
     }
     ExecSpace::fence();
   }
 };
 
-} /* namespace Test */
-
-/*--------------------------------------------------------------------------*/
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestViewOfClass.hpp b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
index 381b8786bc740dfcfb922eb6ddf5443ffa7136cd..d624c5dda2034b04b5b1a427614f38186aa032d8 100644
--- a/lib/kokkos/core/unit_test/TestViewOfClass.hpp
+++ b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,34 +48,29 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 template< class Space >
 struct NestedView {
-
-  Kokkos::View<int*,Space> member ;
+  Kokkos::View< int*, Space > member;
 
 public:
-
   KOKKOS_INLINE_FUNCTION
-  NestedView() : member()
-    {}
+  NestedView() : member() {}
 
   KOKKOS_INLINE_FUNCTION
-  NestedView & operator = ( const Kokkos::View<int*,Space> & lhs )
-    {
-      member = lhs ;
-      if ( member.dimension_0() ) Kokkos::atomic_add( & member(0) , 1 );
-      return *this ;
-    }
+  NestedView & operator=( const Kokkos::View< int*, Space > & lhs )
+  {
+    member = lhs;
+    if ( member.dimension_0() ) Kokkos::atomic_add( & member( 0 ), 1 );
+    return *this;
+  }
 
   KOKKOS_INLINE_FUNCTION
   ~NestedView()
-  { 
+  {
     if ( member.dimension_0() ) {
-      Kokkos::atomic_add( & member(0) , -1 );
+      Kokkos::atomic_add( & member( 0 ), -1 );
     }
   }
 };
@@ -83,49 +78,44 @@ public:
 template< class Space >
 struct NestedViewFunctor {
 
-  Kokkos::View< NestedView<Space> * , Space > nested ;
-  Kokkos::View<int*,Space>                    array ;
+  Kokkos::View< NestedView<Space> *, Space > nested;
+  Kokkos::View< int*, Space >                array;
 
-  NestedViewFunctor( 
-    const Kokkos::View< NestedView<Space> * , Space > & arg_nested ,
-    const Kokkos::View<int*,Space>                    & arg_array )
+  NestedViewFunctor(
+    const Kokkos::View< NestedView<Space> *, Space > & arg_nested,
+    const Kokkos::View< int*, Space >                & arg_array )
   : nested( arg_nested )
   , array(  arg_array )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( int i ) const
-    { nested[i] = array ; }
+  void operator()( int i ) const { nested[i] = array; }
 };
 
-
 template< class Space >
 void view_nested_view()
 {
-  Kokkos::View<int*,Space> tracking("tracking",1);
+  Kokkos::View< int*, Space > tracking( "tracking", 1 );
 
-  typename Kokkos::View<int*,Space>::HostMirror
-     host_tracking = Kokkos::create_mirror( tracking );
+  typename Kokkos::View< int*, Space >::HostMirror host_tracking = Kokkos::create_mirror( tracking );
 
   {
-    Kokkos::View< NestedView<Space> * , Space > a("a_nested_view",2);
+    Kokkos::View< NestedView<Space> *, Space > a( "a_nested_view", 2 );
 
-    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( a , tracking ) );
-    Kokkos::deep_copy( host_tracking , tracking );
-    ASSERT_EQ( 2 , host_tracking(0) );
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( a, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 2, host_tracking( 0 ) );
 
-    Kokkos::View< NestedView<Space> * , Space > b("b_nested_view",2);
-    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( b , tracking ) );
-    Kokkos::deep_copy( host_tracking , tracking );
-    ASSERT_EQ( 4 , host_tracking(0) );
+    Kokkos::View< NestedView<Space> *, Space > b( "b_nested_view", 2 );
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( b, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 4, host_tracking( 0 ) );
 
   }
-  Kokkos::deep_copy( host_tracking , tracking );
 
-  ASSERT_EQ( 0 , host_tracking(0) );
-}
+  Kokkos::deep_copy( host_tracking, tracking );
 
+  ASSERT_EQ( 0, host_tracking( 0 ) );
 }
 
-/*--------------------------------------------------------------------------*/
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp b/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
index 09141e582c48423341029bae51c09fe51d14c893..21ae92e93ccdc09c3e42057f706c7bec383239eb 100644
--- a/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
+++ b/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,35 +48,29 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< typename SpaceDst , typename SpaceSrc >
+template< typename SpaceDst, typename SpaceSrc >
 void view_space_assign()
 {
-  Kokkos::View<double*,SpaceDst> a =
-  Kokkos::View<double*,SpaceSrc>("a",1);
+  Kokkos::View< double*, SpaceDst > a =
+    Kokkos::View< double*, SpaceSrc >( "a", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceDst> b =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("b",1);
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst > b =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "b", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceDst> c =
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceSrc>("c",1);
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > c =
+    Kokkos::View< double*, Kokkos::LayoutRight, SpaceSrc >( "c", 1 );
 
-  Kokkos::View<double*,SpaceDst,Kokkos::MemoryRandomAccess> d =
-  Kokkos::View<double*,SpaceSrc>("d",1);
+  Kokkos::View< double*, SpaceDst, Kokkos::MemoryRandomAccess > d =
+    Kokkos::View< double*, SpaceSrc >( "d", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceDst,Kokkos::MemoryRandomAccess> e =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("e",1);
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst, Kokkos::MemoryRandomAccess > e =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "e", 1 );
 
   // Rank-one layout can assign:
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceDst> f =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("f",1);
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > f =
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "f", 1 );
 }
 
-
 } // namespace Test
-
-/*--------------------------------------------------------------------------*/
-
diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp
index 1c2575b6f61c9fa11b28963852085960ecc420aa..386301b45dbc9f9d6bb5770133d818a7eccba40e 100644
--- a/lib/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,64 +48,68 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace TestViewSubview {
 
-template<class Layout, class Space>
+template< class Layout, class Space >
 struct getView {
   static
-    Kokkos::View<double**,Layout,Space> get(int n, int m) {
-      return Kokkos::View<double**,Layout,Space>("G",n,m);
+    Kokkos::View< double**, Layout, Space > get( int n, int m ) {
+      return Kokkos::View< double**, Layout, Space >( "G", n, m );
   }
 };
 
-template<class Space>
-struct getView<Kokkos::LayoutStride,Space> {
+template< class Space >
+struct getView< Kokkos::LayoutStride, Space > {
   static
-    Kokkos::View<double**,Kokkos::LayoutStride,Space> get(int n, int m) {
-      const int rank = 2 ;
+    Kokkos::View< double**, Kokkos::LayoutStride, Space > get( int n, int m ) {
+      const int rank = 2;
       const int order[] = { 0, 1 };
-      const unsigned dim[] = { unsigned(n), unsigned(m) };
-      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank , order , dim );
-      return Kokkos::View<double**,Kokkos::LayoutStride,Space>("G",stride);
+      const unsigned dim[] = { unsigned( n ), unsigned( m ) };
+      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank, order, dim );
+
+      return Kokkos::View< double**, Kokkos::LayoutStride, Space >( "G", stride );
   }
 };
 
-template<class ViewType, class Space>
+template< class ViewType, class Space >
 struct fill_1D {
   typedef typename Space::execution_space execution_space;
   typedef typename ViewType::size_type size_type;
+
   ViewType a;
   double val;
-  fill_1D(ViewType a_, double val_):a(a_),val(val_) {
-  }
+
+  fill_1D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int i) const {
-    a(i) = val;
-  }
+  void operator()( const int i ) const { a( i ) = val; }
 };
 
-template<class ViewType, class Space>
+template< class ViewType, class Space >
 struct fill_2D {
   typedef typename Space::execution_space execution_space;
   typedef typename ViewType::size_type size_type;
+
   ViewType a;
   double val;
-  fill_2D(ViewType a_, double val_):a(a_),val(val_) {
-  }
+
+  fill_2D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int i) const{
-    for(int j = 0; j < static_cast<int>(a.dimension_1()); j++)
-      a(i,j) = val;
+  void operator()( const int i ) const
+  {
+    for ( int j = 0; j < static_cast< int >( a.dimension_1() ); j++ ) {
+      a( i, j ) = val;
+    }
   }
 };
 
-template<class Layout, class Space>
+template< class Layout, class Space >
 void test_auto_1d ()
 {
-  typedef Kokkos::View<double**, Layout, Space> mv_type;
+  typedef Kokkos::View< double**, Layout, Space > mv_type;
   typedef typename mv_type::size_type size_type;
+
   const double ZERO = 0.0;
   const double ONE = 1.0;
   const double TWO = 2.0;
@@ -113,359 +117,359 @@ void test_auto_1d ()
   const size_type numRows = 10;
   const size_type numCols = 3;
 
-  mv_type X = getView<Layout,Space>::get(numRows, numCols);
-  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view (X);
+  mv_type X = getView< Layout, Space >::get( numRows, numCols );
+  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view( X );
 
-  fill_2D<mv_type,Space> f1(X, ONE);
-  Kokkos::parallel_for(X.dimension_0(),f1);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ONE);
+  fill_2D< mv_type, Space > f1( X, ONE );
+  Kokkos::parallel_for( X.dimension_0(), f1 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ONE );
     }
   }
 
-  fill_2D<mv_type,Space> f2(X, 0.0);
-  Kokkos::parallel_for(X.dimension_0(),f2);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ZERO);
+  fill_2D< mv_type, Space > f2( X, 0.0 );
+  Kokkos::parallel_for( X.dimension_0(), f2 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
     }
   }
 
-  fill_2D<mv_type,Space> f3(X, TWO);
-  Kokkos::parallel_for(X.dimension_0(),f3);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == TWO);
+  fill_2D< mv_type, Space > f3( X, TWO );
+  Kokkos::parallel_for( X.dimension_0(), f3 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == TWO );
     }
   }
 
-  for (size_type j = 0; j < numCols; ++j) {
-    auto X_j = Kokkos::subview (X, Kokkos::ALL, j);
+  for ( size_type j = 0; j < numCols; ++j ) {
+    auto X_j = Kokkos::subview( X, Kokkos::ALL, j );
 
-    fill_1D<decltype(X_j),Space> f4(X_j, ZERO);
-    Kokkos::parallel_for(X_j.dimension_0(),f4);
-    Kokkos::deep_copy (X_h, X);
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ZERO);
+    fill_1D< decltype( X_j ), Space > f4( X_j, ZERO );
+    Kokkos::parallel_for( X_j.dimension_0(), f4 );
+    Kokkos::deep_copy( X_h, X );
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
     }
 
-    for (size_type jj = 0; jj < numCols; ++jj) {
-      auto X_jj = Kokkos::subview (X, Kokkos::ALL, jj);
-      fill_1D<decltype(X_jj),Space> f5(X_jj, ONE);
-      Kokkos::parallel_for(X_jj.dimension_0(),f5);
-      Kokkos::deep_copy (X_h, X);
-      for (size_type i = 0; i < numRows; ++i) {
-        ASSERT_TRUE(X_h(i,jj) == ONE);
+    for ( size_type jj = 0; jj < numCols; ++jj ) {
+      auto X_jj = Kokkos::subview( X, Kokkos::ALL, jj );
+      fill_1D< decltype( X_jj ), Space > f5( X_jj, ONE );
+      Kokkos::parallel_for( X_jj.dimension_0(), f5 );
+      Kokkos::deep_copy( X_h, X );
+      for ( size_type i = 0; i < numRows; ++i ) {
+        ASSERT_TRUE( X_h( i, jj ) == ONE );
       }
     }
   }
 }
 
-template<class LD, class LS, class Space>
-void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, int m) {
-  Kokkos::View<double**,LS,Space> l2d("l2d",n,m);
+template< class LD, class LS, class Space >
+void test_1d_strided_assignment_impl( bool a, bool b, bool c, bool d, int n, int m ) {
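+  // Flags a-d select which of the four 1D slice checks below run: full (Kokkos::ALL) and
+  // ranged (pair) slices along each dimension of the n x m view, each verified by pointer
+  // comparison against the parent view.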
+  Kokkos::View< double**, LS, Space > l2d( "l2d", n, m );
 
-  int col = n>2?2:0;
-  int row = m>2?2:0;
+  int col = n > 2 ? 2 : 0;
+  int row = m > 2 ? 2 : 0;
 
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-  if(a) {
-    Kokkos::View<double*,LD,Space> l1da = Kokkos::subview(l2d,Kokkos::ALL,row);
-    ASSERT_TRUE( & l1da(0) == & l2d(0,row) );
-    if(n>1)
-      ASSERT_TRUE( & l1da(1) == & l2d(1,row) );
-  }
-  if(b && n>13) {
-    Kokkos::View<double*,LD,Space> l1db = Kokkos::subview(l2d,std::pair<unsigned,unsigned>(2,13),row);
-    ASSERT_TRUE( & l1db(0) == & l2d(2,row) );
-    ASSERT_TRUE( & l1db(1) == & l2d(3,row) );
-  }
-  if(c) {
-    Kokkos::View<double*,LD,Space> l1dc = Kokkos::subview(l2d,col,Kokkos::ALL);
-    ASSERT_TRUE( & l1dc(0) == & l2d(col,0) );
-    if(m>1)
-      ASSERT_TRUE( & l1dc(1) == & l2d(col,1) );
-  }
-  if(d && m>13) {
-    Kokkos::View<double*,LD,Space> l1dd = Kokkos::subview(l2d,col,std::pair<unsigned,unsigned>(2,13));
-    ASSERT_TRUE( & l1dd(0) == & l2d(col,2) );
-    ASSERT_TRUE( & l1dd(1) == & l2d(col,3) );
-  }
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    if ( a ) {
+      Kokkos::View< double*, LD, Space > l1da = Kokkos::subview( l2d, Kokkos::ALL, row );
+      ASSERT_TRUE( & l1da( 0 ) == & l2d( 0, row ) );
+      if ( n > 1 ) {
+        ASSERT_TRUE( & l1da( 1 ) == & l2d( 1, row ) );
+      }
+    }
+
+    if ( b && n > 13 ) {
+      Kokkos::View< double*, LD, Space > l1db = Kokkos::subview( l2d, std::pair< unsigned, unsigned >( 2, 13 ), row );
+      ASSERT_TRUE( & l1db( 0 ) == & l2d( 2, row ) );
+      ASSERT_TRUE( & l1db( 1 ) == & l2d( 3, row ) );
+    }
+
+    if ( c ) {
+      Kokkos::View< double*, LD, Space > l1dc = Kokkos::subview( l2d, col, Kokkos::ALL );
+      ASSERT_TRUE( & l1dc( 0 ) == & l2d( col, 0 ) );
+      if ( m > 1 ) {
+        ASSERT_TRUE( & l1dc( 1 ) == & l2d( col, 1 ) );
+      }
+    }
+
+    if ( d && m > 13 ) {
+      Kokkos::View< double*, LD, Space > l1dd = Kokkos::subview( l2d, col, std::pair< unsigned, unsigned >( 2, 13 ) );
+      ASSERT_TRUE( & l1dd( 0 ) == & l2d( col, 2 ) );
+      ASSERT_TRUE( & l1dd( 1 ) == & l2d( col, 3 ) );
+    }
   }
 
 }
 
-template<class Space >
+template< class Space >
 void test_1d_strided_assignment() {
-  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutLeft,Space>(true,true,true,true,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutRight,Space>(true,true,true,true,17,3);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutLeft, Space >( true, true, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
 }
 
 template< class Space >
 void test_left_0()
 {
-  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutLeft , Space >
-    view_static_8_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space > view_static_8_type;
 
-  view_static_8_type  x_static_8("x_static_left_8");
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_static_8_type x_static_8( "x_static_left_8" );
 
-  ASSERT_TRUE( x_static_8.is_contiguous() );
+    ASSERT_TRUE( x_static_8.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x_static_8(1,1,2,3,0,1,2,3) );
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                                 , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,3,1,1,2,3) );
-  ASSERT_TRUE( & x2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    ASSERT_TRUE( ! sx4.is_contiguous() );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_1()
 {
-  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutLeft , Space >
-    view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutLeft, Space > view_type;
 
-  view_type  x8("x_left_8",2,3,4,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type x8( "x_left_8", 2, 3, 4, 5 );
 
-  ASSERT_TRUE( x8.is_contiguous() );
+    ASSERT_TRUE( x8.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x8(1,1,2,3,0,1,2,3) );
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                         , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,3,1,1,2,3) );
-  ASSERT_TRUE( & x2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    ASSERT_TRUE( ! sx4.is_contiguous() );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_2()
 {
-  typedef Kokkos::View< int **** , Kokkos::LayoutLeft , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-
-  view_type  x4("x4",2,3,4,5);
-
-  ASSERT_TRUE( x4.is_contiguous() );
-
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x4 , 0, 0, 0, 0 );
-
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x4(0,0,0,0) );
-
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, 2, 3 );
-
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x4(0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x4(1,1,2,3) );
-
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, Kokkos::pair<int,int>(1,3), 2 );
-
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x4(0,1,1,2) );
-  ASSERT_TRUE( & x2(1,0) == & x4(1,1,1,2) );
-  ASSERT_TRUE( & x2(0,1) == & x4(0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x4(1,1,2,2) );
-
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x4, 1, Kokkos::pair<int,int>(0,2)
-                       , 2, Kokkos::pair<int,int>(1,4) );
-
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x4(1,0,2,1) );
-  ASSERT_TRUE( & sx2(1,0) == & x4(1,1,2,1) );
-  ASSERT_TRUE( & sx2(0,1) == & x4(1,0,2,2) );
-  ASSERT_TRUE( & sx2(1,1) == & x4(1,1,2,2) );
-  ASSERT_TRUE( & sx2(0,2) == & x4(1,0,2,3) );
-  ASSERT_TRUE( & sx2(1,2) == & x4(1,1,2,3) );
-
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(1,2) /* of [2] */
-                       , Kokkos::pair<int,int>(1,3) /* of [3] */
-                       , Kokkos::pair<int,int>(0,4) /* of [4] */
-                       , Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x4( 1+i0, 1+i1, 0+i2, 2+i3 ) );
-  }
-
+  typedef Kokkos::View< int ****, Kokkos::LayoutLeft, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type x4( "x4", 2, 3, 4, 5 );
+
+    ASSERT_TRUE( x4.is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x4, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x4( 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x4( 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1
+                         , Kokkos::pair< int, int >( 1, 3 ), 2 );
+
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x4( 0, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x4( 1, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x4( 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x4, 1, Kokkos::pair< int, int >( 0, 2 )
+                         , 2, Kokkos::pair< int, int >( 1, 4 ) );
+
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x4( 1, 0, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x4( 1, 1, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x4( 1, 0, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 0, 2 ) == & x4( 1, 0, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 2 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 1, 2 ) /* of [2] */
+                         , Kokkos::pair< int, int >( 1, 3 ) /* of [3] */
+                         , Kokkos::pair< int, int >( 0, 4 ) /* of [4] */
+                         , Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( ! sx4.is_contiguous() );
+
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x4( 1 + i0, 1 + i1, 0 + i2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_3()
 {
-  typedef Kokkos::View< int ** , Kokkos::LayoutLeft , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int **, Kokkos::LayoutLeft, Space > view_type;
 
-  view_type  xm("x4",10,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
 
-  ASSERT_TRUE( xm.is_contiguous() );
+    ASSERT_TRUE( xm.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( xm , 5, 3 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( xm, 5, 3 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & xm(5,3) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( xm, Kokkos::ALL, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 = Kokkos::subview( xm, Kokkos::ALL, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  for ( int i = 0 ; i < int(xm.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x1(i) == & xm(i,3) );
-  }
-
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL );
+    ASSERT_TRUE( x1.is_contiguous() );
+    for ( int i = 0; i < int( xm.dimension_0() ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( i, 3 ) );
+    }
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2(i,j) == & xm(1+i,j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2c =
-    Kokkos::subview( xm, Kokkos::ALL, std::pair<int,int>(2,4) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    for ( int j = 0; j < int( x2.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( 1 + i, j ) );
+    }
 
-  ASSERT_TRUE( x2c.is_contiguous() );
-  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2c(i,j) == & xm(i,2+j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2c =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n1 =
-    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL );
+    ASSERT_TRUE( x2c.is_contiguous() );
+    for ( int j = 0; j < int( x2c.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2c.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( i, 2 + j ) );
+    }
 
-  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
-  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n2 =
-    Kokkos::subview( xm , Kokkos::ALL , std::pair<int,int>(1,1) );
+    ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+    ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
 
-  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
-  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
 
+    ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+    ASSERT_TRUE( x2_n2.dimension_1() == 0 );
   }
 }
 
@@ -474,766 +478,814 @@ void test_left_3()
 template< class Space >
 void test_right_0()
 {
-  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutRight , Space >
-    view_static_8_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-
-  view_static_8_type  x_static_8("x_static_right_8");
-
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
-
-  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
-
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
-
-  ASSERT_TRUE( x1.dimension_0() == 2 );
-  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,1) );
-  ASSERT_TRUE( & x1(1) == & x_static_8(0,1,2,3,0,1,2,2) );
-
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
-                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
-
-  ASSERT_TRUE( x2.dimension_0() == 2 );
-  ASSERT_TRUE( x2.dimension_1() == 2 );
-  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,1,0,1,2,1) );
-  ASSERT_TRUE( & x2(1,0) == & x_static_8(0,1,2,2,0,1,2,1) );
-  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,1,0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x_static_8(0,1,2,2,0,1,2,2) );
-
-  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
-
-  ASSERT_TRUE( sx2.dimension_0() == 2 );
-  ASSERT_TRUE( sx2.dimension_1() == 2 );
-  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
-
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  ASSERT_TRUE( sx4.dimension_0() == 2 );
-  ASSERT_TRUE( sx4.dimension_1() == 2 );
-  ASSERT_TRUE( sx4.dimension_2() == 2 );
-  ASSERT_TRUE( sx4.dimension_3() == 2 );
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0, 0+i0, 1, 1+i1, 1, 0+i2, 2, 2+i3) );
-  }
-
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space > view_static_8_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_static_8_type x_static_8( "x_static_right_8" );
+
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x1.dimension_0() == 2 );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                                 , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x2.dimension_0() == 2 );
+    ASSERT_TRUE( x2.dimension_1() == 2 );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( sx2.dimension_0() == 2 );
+    ASSERT_TRUE( sx2.dimension_1() == 2 );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( sx4.dimension_0() == 2 );
+    ASSERT_TRUE( sx4.dimension_1() == 2 );
+    ASSERT_TRUE( sx4.dimension_2() == 2 );
+    ASSERT_TRUE( sx4.dimension_3() == 2 );
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_right_1()
 {
-  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutRight , Space >
-    view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutRight, Space > view_type;
 
-  view_type  x8("x_right_8",2,3,4,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type x8( "x_right_8", 2, 3, 4, 5 );
 
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
 
-  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,1) );
-  ASSERT_TRUE( & x1(1) == & x8(0,1,2,3,0,1,2,2) );
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( x8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
-                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                         , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
 
-  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,1,0,1,2,1) );
-  ASSERT_TRUE( & x2(1,0) == & x8(0,1,2,2,0,1,2,1) );
-  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,1,0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x8(0,1,2,2,0,1,2,2) );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_right_3()
 {
-  typedef Kokkos::View< int ** , Kokkos::LayoutRight , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int **, Kokkos::LayoutRight, Space > view_type;
 
-  view_type  xm("x4",10,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
 
-  ASSERT_TRUE( xm.is_contiguous() );
+    ASSERT_TRUE( xm.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( xm , 5, 3 );
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( xm, 5, 3 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & xm(5,3) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( xm, 3, Kokkos::ALL );
-
-  ASSERT_TRUE( x1.is_contiguous() );
-  for ( int i = 0 ; i < int(xm.dimension_1()) ; ++i ) {
-    ASSERT_TRUE( & x1(i) == & xm(3,i) );
-  }
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 = Kokkos::subview( xm, 3, Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2c =
-    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL );
+    ASSERT_TRUE( x1.is_contiguous() );
+    for ( int i = 0; i < int( xm.dimension_1() ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( 3, i ) );
+    }
 
-  ASSERT_TRUE( x2c.is_contiguous() );
-  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2c(i,j) == & xm(1+i,j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2c =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( xm, Kokkos::ALL, std::pair<int,int>(2,4) );
+    ASSERT_TRUE( x2c.is_contiguous() );
+    for ( int j = 0; j < int( x2c.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2c.dimension_0() ); ++i ) {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( 1 + i, j ) );
+    }
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2(i,j) == & xm(i,2+j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n1 =
-    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    for ( int j = 0; j < int( x2.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( i, 2 + j ) );
+    }
 
-  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
-  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n2 =
-    Kokkos::subview( xm , Kokkos::ALL , std::pair<int,int>(1,1) );
+    ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+    ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
 
-  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
-  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
 
+    ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+    ASSERT_TRUE( x2_n2.dimension_1() == 0 );
   }
 }
 
 namespace Impl {
 
-constexpr int N0=113;
-constexpr int N1=11;
-constexpr int N2=17;
-constexpr int N3=5;
-constexpr int N4=7;
+constexpr int N0 = 113;
+constexpr int N1 = 11;
+constexpr int N2 = 17;
+constexpr int N3 = 5;
+constexpr int N4 = 7;
 
-template<class SubView,class View>
-void test_Check1D(SubView a, View b, std::pair<int,int> range) {
+template< class SubView, class View >
+void test_Check1D( SubView a, View b, std::pair< int, int > range ) {
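+  // Check that a( i ) == b( i + range.first ) for every i in [0, range.second - range.first).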
   int errors = 0;
-  for(int i=0;i<range.second-range.first;i++) {
-    if(a(i)!=b(i+range.first))
-      errors++;
+
+  for ( int i = 0; i < range.second - range.first; i++ ) {
+    if ( a( i ) != b( i + range.first ) ) errors++;
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check1D: " << errors << std::endl;
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check1D: " << errors <<std::endl;
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check1D2D(SubView a, View b, int i0, std::pair<int,int> range) {
+template< class SubView, class View >
+void test_Check1D2D( SubView a, View b, int i0, std::pair< int, int > range ) {
   int errors = 0;
-  for(int i1=0;i1<range.second-range.first;i1++) {
-    if(a(i1)!=b(i0,i1+range.first))
-      errors++;
+
+  for ( int i1 = 0; i1 < range.second - range.first; i1++ ) {
+    if ( a( i1 ) != b( i0, i1 + range.first ) ) errors++;
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check1D2D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check1D2D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check2D3D(SubView a, View b, int i0, std::pair<int,int> range1, std::pair<int,int> range2) {
+template< class SubView, class View >
+void test_Check2D3D( SubView a, View b, int i0, std::pair< int, int > range1
+                   , std::pair< int, int > range2 )
+{
   int errors = 0;
-  for(int i1=0;i1<range1.second-range1.first;i1++) {
-    for(int i2=0;i2<range2.second-range2.first;i2++) {
-      if(a(i1,i2)!=b(i0,i1+range1.first,i2+range2.first))
-        errors++;
+
+  for ( int i1 = 0; i1 < range1.second - range1.first; i1++ ) {
+    for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+      if ( a( i1, i2 ) != b( i0, i1 + range1.first, i2 + range2.first ) ) errors++;
     }
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check2D3D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check2D3D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check3D5D(SubView a, View b, int i0, int i1, std::pair<int,int> range2, std::pair<int,int> range3, std::pair<int,int> range4) {
+template< class SubView, class View >
+void test_Check3D5D( SubView a, View b, int i0, int i1, std::pair< int, int > range2
+                   , std::pair< int, int > range3, std::pair< int, int > range4 )
+{
   int errors = 0;
-  for(int i2=0;i2<range2.second-range2.first;i2++) {
-    for(int i3=0;i3<range3.second-range3.first;i3++) {
-      for(int i4=0;i4<range4.second-range4.first;i4++) {
-        if(a(i2,i3,i4)!=b(i0,i1,i2+range2.first,i3+range3.first,i4+range4.first))
+
+  for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+    for ( int i3 = 0; i3 < range3.second - range3.first; i3++ ) {
+      for ( int i4 = 0; i4 < range4.second - range4.first; i4++ ) {
+        if ( a( i2, i3, i4 ) != b( i0, i1, i2 + range2.first, i3 + range3.first, i4 + range4.first ) ) {
           errors++;
+        }
       }
     }
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check3D5D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Suviews test_Check3D5D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_1d_assign_impl() {
-
-  { //Breaks
-    Kokkos::View<int*,LayoutOrg,Space> a_org("A",N0);
-    Kokkos::View<int*,LayoutOrg,Space,MemTraits> a(a_org);
+  { // Breaks.
+    Kokkos::View< int*, LayoutOrg, Space > a_org( "A", N0 );
+    Kokkos::View< int*, LayoutOrg, Space, MemTraits > a( a_org );
     Kokkos::fence();
-    for(int i=0; i<N0; i++)
-      a_org(i) = i;
+    for ( int i = 0; i < N0; i++ ) a_org( i ) = i;
 
-    Kokkos::View<int[N0],Layout,Space,MemTraits> a1(a);
+    Kokkos::View< int[N0], Layout, Space, MemTraits > a1( a );
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
 
-    Kokkos::View<int[N0],LayoutSub,Space,MemTraits> a2(a1);
+    Kokkos::View< int[N0], LayoutSub, Space, MemTraits > a2( a1 );
     Kokkos::fence();
-    test_Check1D(a2,a,std::pair<int,int>(0,N0));
+    test_Check1D( a2, a, std::pair< int, int >( 0, N0 ) );
     a1 = a;
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
 
-    //Runtime Fail expected
-    //Kokkos::View<int[N1]> afail1(a);
+    // Runtime failure expected.
+    //Kokkos::View< int[N1] > afail1( a );
 
-    //Compile Time Fail expected
-    //Kokkos::View<int[N1]> afail2(a1);
+    // Compile-time failure expected.
+    //Kokkos::View< int[N1] > afail2( a1 );
   }
 
-  { // Works
-    Kokkos::View<int[N0],LayoutOrg,Space,MemTraits> a("A");
-    Kokkos::View<int*,Layout,Space,MemTraits> a1(a);
+  { // Works.
+    Kokkos::View< int[N0], LayoutOrg, Space, MemTraits > a( "A" );
+    Kokkos::View< int*, Layout, Space, MemTraits > a1( a );
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
     a1 = a;
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
   }
 }
 
-template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg,class MemTraits>
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_2d_subview_3d_impl_type() {
-  Kokkos::View<int***,LayoutOrg,Space> a_org("A",N0,N1,N2);
-  Kokkos::View<Type,Layout,Space,MemTraits> a(a_org);
-  for(int i0=0; i0<N0; i0++)
-    for(int i1=0; i1<N1; i1++)
-      for(int i2=0; i2<N2; i2++)
-        a_org(i0,i1,i2) = i0*1000000+i1*1000+i2;
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a1;
-  a1 = Kokkos::subview(a,3,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< int***, LayoutOrg, Space > a_org( "A", N0, N1, N2 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  {
+    a_org( i0, i1, i2 ) = i0 * 1000000 + i1 * 1000 + i2;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check2D3D(a1,a,3,std::pair<int,int>(0,N1),std::pair<int,int>(0,N2));
+  test_Check2D3D( a1, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
 
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a2(a,3,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check2D3D(a2,a,3,std::pair<int,int>(0,N1),std::pair<int,int>(0,N2));
+  test_Check2D3D( a2, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_2d_subview_3d_impl_layout() {
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int***         ,int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int***         ,int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int***         ,int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int***         , int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 }
 
-template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_3d_subview_5d_impl_type() {
-  Kokkos::View<int*****,LayoutOrg,Space> a_org("A",N0,N1,N2,N3,N4);
-  Kokkos::View<Type,Layout,Space,MemTraits> a(a_org);
-  for(int i0=0; i0<N0; i0++)
-    for(int i1=0; i1<N1; i1++)
-      for(int i2=0; i2<N2; i2++)
-        for(int i3=0; i3<N3; i3++)
-          for(int i4=0; i4<N4; i4++)
-            a_org(i0,i1,i2,i3,i4) = i0*1000000+i1*10000+i2*100+i3*10+i4;
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a1;
-  a1 = Kokkos::subview(a,3,5,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< int*****, LayoutOrg, Space > a_org( "A", N0, N1, N2, N3, N4 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  for ( int i3 = 0; i3 < N3; i3++ )
+  for ( int i4 = 0; i4 < N4; i4++ )
+  {
+    a_org( i0, i1, i2, i3, i4 ) = i0 * 1000000 + i1 * 10000 + i2 * 100 + i3 * 10 + i4;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check3D5D(a1,a,3,5,std::pair<int,int>(0,N2),std::pair<int,int>(0,N3),std::pair<int,int>(0,N4));
+  test_Check3D5D( a1, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
 
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a2(a,3,5,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check3D5D(a2,a,3,5,std::pair<int,int>(0,N2),std::pair<int,int>(0,N3),std::pair<int,int>(0,N4));
+  test_Check3D5D( a2, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_3d_subview_5d_impl_layout() {
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int*****               ,int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*****               , int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
 }
 
 inline
 void test_subview_legal_args_right() {
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
 }
 
 inline
 void test_subview_legal_args_left() {
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
 }
 
-}
+} // namespace Impl
 
-template< class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_1d_assign() {
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutRight ,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight ,Kokkos::LayoutRight, MemTraits>();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutRight ,Kokkos::LayoutRight, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft, MemTraits>();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft, MemTraits >();
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_2d_subview_3d() {
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft,  MemTraits>();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_3d_subview_5d_right() {
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight, MemTraits>();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_3d_subview_5d_left() {
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits>();
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft,  MemTraits>();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
 }
 
+namespace Impl {
 
+template< class Layout, class Space >
+struct FillView_3D {
+  Kokkos::View< int***, Layout, Space > a;
 
-namespace Impl {
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const
+  {
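+    // Decompose the flat index ii into (i, j, k) according to the Layout,
+    // then store a value that encodes the coordinates so the subview
+    // checks below can detect any misplaced element.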
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % a.dimension_0()
+                : ii / ( a.dimension_1() * a.dimension_2() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / a.dimension_0() ) % a.dimension_1()
+                : ( ii / a.dimension_2() ) % a.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( a.dimension_0() * a.dimension_1() )
+                : ii % a.dimension_2();
 
-  template<class Layout, class Space>
-  struct FillView_3D {
-    Kokkos::View<int***,Layout,Space> a;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % a.dimension_0(): ii / (a.dimension_1()*a.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / a.dimension_0()) % a.dimension_1() : (ii / a.dimension_2()) % a.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (a.dimension_0() * a.dimension_1()) : ii % a.dimension_2();
-      a(i,j,k) = 1000000 * i + 1000 * j + k;
+    a( i, j, k ) = 1000000 * i + 1000 * j + k;
+  }
+};
+
+template< class Layout, class Space >
+struct FillView_4D {
+  Kokkos::View< int****, Layout, Space > a;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const {
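+    // Same idea as FillView_3D, extended to rank 4: decompose ii into
+    // (i, j, k, l) according to the Layout and store an encoding of the
+    // coordinates.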
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % a.dimension_0()
+                : ii / ( a.dimension_1() * a.dimension_2() * a.dimension_3() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / a.dimension_0() ) % a.dimension_1()
+                : ( ii / ( a.dimension_2() * a.dimension_3() ) ) % a.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ( ii / ( a.dimension_0() * a.dimension_1() ) ) % a.dimension_2()
+                : ( ii / a.dimension_3() ) % a.dimension_2();
+
+    const int l = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( a.dimension_0() * a.dimension_1() * a.dimension_2() )
+                : ii % a.dimension_3();
+
+    a( i, j, k, l ) = 1000000 * i + 10000 * j + 100 * k + l;
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_3D {
+  Kokkos::View< const int***, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const
+  {
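+    // Recompute (i, j, k) from the flat index over b's extents and verify
+    // that the subview entry b(i, j, k) matches the corresponding offset
+    // location in the parent view a.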
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.dimension_0()
+                : ii / ( b.dimension_1() * b.dimension_2() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.dimension_0() ) % b.dimension_1()
+                : ( ii / b.dimension_2() ) % b.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.dimension_0() * b.dimension_1() )
+                : ii % b.dimension_2();
+
+    if ( a( i + offset_0, j, k + offset_2 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-3D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
     }
-  };
-
-  template<class Layout, class Space>
-  struct FillView_4D {
-    Kokkos::View<int****,Layout,Space> a;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-         ii % a.dimension_0(): ii / (a.dimension_1()*a.dimension_2()*a.dimension_3());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / a.dimension_0()) % a.dimension_1() : (ii / (a.dimension_2()*a.dimension_3()) % a.dimension_1());
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        (ii / (a.dimension_0() * a.dimension_1())) % a.dimension_2() : (ii / a.dimension_3()) % a.dimension_2();
-      const int l = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-         ii / (a.dimension_0() * a.dimension_1() * a.dimension_2()) : ii % a.dimension_3();
-      a(i,j,k,l) = 1000000 * i + 10000 * j + 100 * k + l;
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_4D {
+  Kokkos::View< const int****, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2, index;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const {
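+    // As in the 3D-3D check, recover (i, j, k) from the flat index over
+    // b's extents; the mapping below converts them back into the rank-4
+    // parent view a.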
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.dimension_0()
+                : ii / ( b.dimension_1() * b.dimension_2() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.dimension_0() ) % b.dimension_1()
+                : ( ii / b.dimension_2() ) % b.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.dimension_0() * b.dimension_1() )
+                : ii % b.dimension_2();
+
+    int i0, i1, i2, i3;
+
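+    // The 3D subview fixed one index of the 4D parent: the last index for
+    // LayoutLeft, the first index for LayoutRight.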
+    if ( std::is_same< Layout, Kokkos::LayoutLeft >::value ) {
+      i0 = i + offset_0;
+      i1 = j;
+      i2 = k + offset_2;
+      i3 = index;
     }
-  }; 
-
-  template<class Layout, class Space, class MemTraits>
-  struct CheckSubviewCorrectness_3D_3D {
-    Kokkos::View<const int***,Layout,Space,MemTraits> a;
-    Kokkos::View<const int***,Layout,Space,MemTraits> b;
-    int offset_0,offset_2;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % b.dimension_0(): ii / (b.dimension_1()*b.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / b.dimension_0()) % b.dimension_1() : (ii / b.dimension_2()) % b.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (b.dimension_0() * b.dimension_1()) : ii % b.dimension_2();
-      if( a(i+offset_0,j,k+offset_2) != b(i,j,k))
-        Kokkos::abort("Error: check_subview_correctness 3D-3D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)");
+    else {
+      i0 = index;
+      i1 = i + offset_0;
+      i2 = j;
+      i3 = k + offset_2;
     }
-  };
-
-  template<class Layout, class Space, class MemTraits>
-  struct CheckSubviewCorrectness_3D_4D {
-    Kokkos::View<const int****,Layout,Space,MemTraits> a;
-    Kokkos::View<const int***,Layout,Space,MemTraits> b;
-    int offset_0,offset_2,index;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % b.dimension_0(): ii / (b.dimension_1()*b.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / b.dimension_0()) % b.dimension_1() : (ii / b.dimension_2()) % b.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (b.dimension_0() * b.dimension_1()) : ii % b.dimension_2();
-
-      int i0,i1,i2,i3;
-      if(std::is_same<Layout,Kokkos::LayoutLeft>::value) {
-        i0 = i + offset_0;
-        i1 = j;
-        i2 = k + offset_2;
-        i3 = index;
-      } else {
-        i0 = index;
-        i1 = i + offset_0;
-        i2 = j;
-        i3 = k + offset_2;
-      }
-      if( a(i0,i1,i2,i3) != b(i,j,k))
-        Kokkos::abort("Error: check_subview_correctness 3D-4D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)");
+
+    if ( a( i0, i1, i2, i3 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-4D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
     }
-  };
-}
+  }
+};
 
-template<class Space, class MemTraits = void>
+} // namespace Impl
+
+template< class Space, class MemTraits = void >
 void test_layoutleft_to_layoutleft() {
   Impl::test_subview_legal_args_left();
 
   {
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> a("A",100,4,3);
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::ALL);
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
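+    // b covers rows [16, 32) of a and all of the remaining two dimensions,
+    // hence offset_0 = 16 and offset_2 = 0 in the check below.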
 
-    Impl::FillView_3D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_3D< Kokkos::LayoutLeft, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);  
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 0;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
+
   {
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> a("A",100,4,5);
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::pair<int,int>(1,3));
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ) );
 
-    Impl::FillView_3D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_3D< Kokkos::LayoutLeft, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
+
   {
-    Kokkos::View<int****,Kokkos::LayoutLeft,Space> a("A",100,4,5,3); 
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::pair<int,int>(1,3),1);
+    Kokkos::View< int****, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ), 1 );
 
-    Impl::FillView_4D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_4D< Kokkos::LayoutLeft, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)*a.extent(3)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_4D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 1;
     check.index = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_layoutright_to_layoutright() {
   Impl::test_subview_legal_args_right();
 
   {
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> a("A",100,4,3);
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::ALL);
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
 
-    Impl::FillView_3D<Kokkos::LayoutRight,Space> fill;
+    Impl::FillView_3D< Kokkos::LayoutRight, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutRight,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutRight, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 0;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
-  {
-    Kokkos::View<int****,Kokkos::LayoutRight,Space> a("A",3,4,5,100);
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> b(a,1,Kokkos::pair<int,int>(1,3),Kokkos::ALL,Kokkos::ALL);
 
+  {
+    Kokkos::View< int****, Kokkos::LayoutRight, Space > a( "A", 3, 4, 5, 100 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, 1, Kokkos::pair< int, int >( 1, 3 ), Kokkos::ALL, Kokkos::ALL );
 
-    Impl::FillView_4D<Kokkos::LayoutRight,Space> fill;
+    Impl::FillView_4D< Kokkos::LayoutRight, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)*a.extent(3)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_4D<Kokkos::LayoutRight,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutRight, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 1;
     check.offset_2 = 0;
     check.index = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
 }
 
-
-}
-//----------------------------------------------------------------------------
-
+} // namespace TestViewSubview
diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp
index f952ab3db51028aff0a0ebfe313b2639e353ab87..4f52fc956707147761dd60354d9cade69b37bb9a 100644
--- a/lib/kokkos/core/unit_test/UnitTestMain.cpp
+++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,15 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
 
-int main(int argc, char *argv[]) {
-  ::testing::InitGoogleTest(&argc,argv);
+int main( int argc, char *argv[] ) {
+  ::testing::InitGoogleTest( &argc, argv );
   return RUN_ALL_TESTS();
 }
-
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda.hpp b/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
index 36b9b0688ba239ec2f6bf2b847184e95b07f84a3..768b0392048184a4e26c320f16329c07bb8caba5 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
@@ -40,31 +40,25 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_CUDA_HPP
 #define KOKKOS_TEST_CUDA_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
-
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
 #include <TestViewSpaceAssign.hpp>
 #include <TestAtomic.hpp>
 #include <TestAtomicOperations.hpp>
-
 #include <TestAtomicViews.hpp>
-
 #include <TestRange.hpp>
 #include <TestTeam.hpp>
 #include <TestReduce.hpp>
@@ -73,20 +67,16 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
 
-// For Some Reason I can only have the definition of SetUp and TearDown in one cpp file ...
+// For some reason I can only have the definitions of SetUp and TearDown in one cpp file ...
 class cuda : public ::testing::Test {
 protected:
   static void SetUpTestCase();
@@ -95,17 +85,19 @@ protected:
 
 #ifdef TEST_CUDA_INSTANTIATE_SETUP_TEARDOWN
 void cuda::SetUpTestCase()
-  {
-    Kokkos::Cuda::print_configuration( std::cout );
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  }
+{
+  Kokkos::print_configuration( std::cout );
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( 0 ) );
+}
 
 void cuda::TearDownTestCase()
-  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
-#endif
+{
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
 }
 #endif
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
index ff379dc805ddcbadcd4e6b135d03beda683d8d5b..7cf19b26d1b3ebe6a73f2614aab51dda9d9bd88c 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
@@ -40,164 +40,164 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , atomics )
+TEST_F( cuda, atomics )
 {
-  const int loop_count = 1e3 ;
+  const int loop_count = 1e3;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 3 ) ) );
 }
 
-TEST_F( cuda , atomic_operations )
+TEST_F( cuda, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 4 ) ) );
   }
 }
 
-TEST_F( cuda , atomic_views_integral )
+TEST_F( cuda, atomic_views_integral )
 {
   const long length = 1000000;
+
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 8 ) ) );
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 8 ) ) );
   }
 }
 
-TEST_F( cuda , atomic_views_nonintegral )
+TEST_F( cuda, atomic_views_nonintegral )
 {
   const long length = 1000000;
-  {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 4 ) ) );
 
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 4 ) ) );
   }
 }
 
-
-TEST_F( cuda , atomic_view_api )
+TEST_F( cuda, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Cuda>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Cuda >();
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
index aeaa2a0e81d8114d95fed6566891fecf98d2feb2..e655193a51f513dd390a5545aebe66ebb44f2c11 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
@@ -40,63 +40,68 @@
 // ************************************************************************
 //@HEADER
 */
+
 #define TEST_CUDA_INSTANTIATE_SETUP_TEARDOWN
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , init ) {
+TEST_F( cuda, init )
+{
   ;
 }
 
-TEST_F( cuda , md_range ) {
-  TestMDRange_2D< Kokkos::Cuda >::test_for2(100,100);
-
-  TestMDRange_3D< Kokkos::Cuda >::test_for3(100,100,100);
+TEST_F( cuda, mdrange_for )
+{
+  TestMDRange_2D< Kokkos::Cuda >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Cuda >::test_for3( 100, 100, 100 );
+  TestMDRange_4D< Kokkos::Cuda >::test_for4( 100, 10, 100, 10 );
+  TestMDRange_5D< Kokkos::Cuda >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Cuda >::test_for6( 100, 10, 5, 2, 10, 5 );
 }
 
-TEST_F( cuda, policy_construction) {
+TEST_F( cuda, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::Cuda >();
   TestTeamPolicyConstruction< Kokkos::Cuda >();
 }
 
-TEST_F( cuda , range_tag )
+TEST_F( cuda, range_tag )
 {
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( cuda , compiler_macros )
+TEST_F( cuda, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( cuda , memory_pool )
+TEST_F( cuda, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Cuda >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -110,24 +115,24 @@ TEST_F( cuda , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( cuda , task_fib )
+TEST_F( cuda, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Cuda >::run(i, (i+1)*(i+1)*10000 );
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Cuda >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 );
   }
 }
 
-TEST_F( cuda , task_depend )
+TEST_F( cuda, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Cuda >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Cuda >::run( i );
   }
 }
 
-TEST_F( cuda , task_team )
+TEST_F( cuda, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Cuda >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Cuda >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Cuda >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Cuda >::run( 1000 ); // Put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -135,55 +140,55 @@ TEST_F( cuda , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-TEST_F( cuda , cxx11 )
+TEST_F( cuda, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Cuda >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Cuda >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( cuda, tile_layout )
 {
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Cuda, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 9, 11 );
 }
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-#if defined (KOKKOS_COMPILER_CLANG)
-TEST_F( cuda , dispatch )
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if defined( KOKKOS_COMPILER_CLANG )
+TEST_F( cuda, dispatch )
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 #endif
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
index b9ab9fe72d494a672cefe07f770ea38663e2ffec..01eed4e023447acb953c27ce2e8aa2ab18d155a4 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
@@ -40,17 +40,17 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , reducers )
+TEST_F( cuda, reducers )
 {
-  TestReducers<int, Kokkos::Cuda>::execute_integer();
-  TestReducers<size_t, Kokkos::Cuda>::execute_integer();
-  TestReducers<double, Kokkos::Cuda>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Cuda>::execute_basic();
+  TestReducers< int, Kokkos::Cuda >::execute_integer();
+  TestReducers< size_t, Kokkos::Cuda >::execute_integer();
+  TestReducers< double, Kokkos::Cuda >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Cuda >::execute_basic();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
index c588d752dd21ef2135d1e4fa52c37f5dba0c37a9..7f4e0973e7a512a5e855ba30c9e65e5a539c123d 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
@@ -40,38 +40,44 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, long_reduce) {
-  TestReduce< long ,   Kokkos::Cuda >( 0 );
-  TestReduce< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce )
+{
+  TestReduce< long, Kokkos::Cuda >( 0 );
+  TestReduce< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, double_reduce) {
-  TestReduce< double ,   Kokkos::Cuda >( 0 );
-  TestReduce< double ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, double_reduce )
+{
+  TestReduce< double, Kokkos::Cuda >( 0 );
+  TestReduce< double, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::Cuda >( 0 );
+  TestReduceDynamic< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::Cuda >( 0 );
+  TestReduceDynamic< double, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::Cuda >( 0 );
+  TestReduceDynamicView< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda , scan )
+TEST_F( cuda, scan )
 {
-  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Cuda >::test_range( 1, 1000 );
   TestScan< Kokkos::Cuda >( 0 );
   TestScan< Kokkos::Cuda >( 100000 );
   TestScan< Kokkos::Cuda >( 10000000 );
@@ -79,10 +85,11 @@ TEST_F( cuda , scan )
 }
 
 #if 0
-TEST_F( cuda , scan_small )
+TEST_F( cuda, scan_small )
 {
-  typedef TestScan< Kokkos::Cuda , Kokkos::Impl::CudaExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::Cuda, Kokkos::Impl::CudaExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -93,38 +100,39 @@ TEST_F( cuda , scan_small )
 }
 #endif
 
-TEST_F( cuda  , team_scan )
+TEST_F( cuda, team_scan )
 {
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( cuda , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( cuda, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( cuda , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( cuda, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( cuda , reduction_deduction )
+TEST_F( cuda, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index f3cbc3b8897625f07f7c4fc810662b68cfe907e9..5bed7640daa114879f789e67807946e0dc2343f4 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -40,6 +40,7 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
@@ -47,353 +48,338 @@ namespace Test {
 __global__
 void test_abort()
 {
-  Kokkos::abort("test_abort");
+  Kokkos::abort( "test_abort" );
 }
 
 __global__
 void test_cuda_spaces_int_value( int * ptr )
 {
-  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
+  if ( *ptr == 42 ) { *ptr = 2 * 42; }
 }
 
-TEST_F( cuda , space_access )
+TEST_F( cuda, space_access )
 {
-  //--------------------------------------
-
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaUVMSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
-
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
-                , Kokkos::HostSpace >::value , "" );
+                , Kokkos::HostSpace >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
                 , Kokkos::Device< Kokkos::HostSpace::execution_space
-                                , Kokkos::CudaUVMSpace > >::value , "" );
+                                , Kokkos::CudaUVMSpace > >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
-                , Kokkos::CudaHostPinnedSpace >::value , "" );
+                , Kokkos::CudaHostPinnedSpace >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Device< Kokkos::HostSpace::execution_space
                                 , Kokkos::CudaUVMSpace >
                 , Kokkos::Device< Kokkos::HostSpace::execution_space
-                                , Kokkos::CudaUVMSpace > >::value , "" );
+                                , Kokkos::CudaUVMSpace > >::value, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::Cuda >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 }
 
 TEST_F( cuda, uvm )
 {
   if ( Kokkos::CudaUVMSpace::available() ) {
+    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >( "uvm_ptr", sizeof( int ) );
 
-    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
-
-    *uvm_ptr = 42 ;
+    *uvm_ptr = 42;
 
     Kokkos::Cuda::fence();
-    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
+    test_cuda_spaces_int_value<<< 1, 1 >>>( uvm_ptr );
     Kokkos::Cuda::fence();
 
-    EXPECT_EQ( *uvm_ptr, int(2*42) );
-
-    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
+    EXPECT_EQ( *uvm_ptr, int( 2 * 42 ) );
 
+    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >( uvm_ptr );
   }
 }
 
 TEST_F( cuda, uvm_num_allocs )
 {
-  // The max number of uvm allocations allowed is 65536
+  // The max number of UVM allocations allowed is 65536.
   #define MAX_NUM_ALLOCS 65536
 
   if ( Kokkos::CudaUVMSpace::available() ) {
-
     struct TestMaxUVMAllocs {
 
-      using view_type         = Kokkos::View< double* , Kokkos::CudaUVMSpace >;
-      using view_of_view_type = Kokkos::View< view_type[ MAX_NUM_ALLOCS ] 
+      using view_type         = Kokkos::View< double*, Kokkos::CudaUVMSpace >;
+      using view_of_view_type = Kokkos::View< view_type[ MAX_NUM_ALLOCS ]
                                             , Kokkos::CudaUVMSpace >;
 
-      TestMaxUVMAllocs()
-      : view_allocs_test("view_allocs_test")
+      TestMaxUVMAllocs() : view_allocs_test( "view_allocs_test" )
       {
+        for ( auto i = 0; i < MAX_NUM_ALLOCS; ++i ) {
 
-        for ( auto i = 0; i < MAX_NUM_ALLOCS ; ++i ) {
-
-          // Kokkos will throw a runtime exception if an attempt is made to 
-          // allocate more than the maximum number of uvm allocations
+          // Kokkos will throw a runtime exception if an attempt is made to
+          // allocate more than the maximum number of UVM allocations.
 
           // In this test, the max num of allocs occurs when i = MAX_NUM_ALLOCS - 1
           // since the 'outer' view counts as one UVM allocation, leaving
-          // 65535 possible UVM allocations, that is 'i in [0 , 65535)'
+          // 65535 possible UVM allocations, that is 'i in [0, 65535)'.
 
-          // The test will catch the exception thrown in this case and continue
+          // The test will catch the exception thrown in this case and continue.
 
-          if ( i == ( MAX_NUM_ALLOCS - 1) ) {
-            EXPECT_ANY_THROW( { view_allocs_test(i) = view_type("inner_view",1); } ) ;
+          if ( i == ( MAX_NUM_ALLOCS - 1 ) ) {
+            EXPECT_ANY_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
           }
           else {
-            if(i<MAX_NUM_ALLOCS - 1000) {
-              EXPECT_NO_THROW( { view_allocs_test(i) = view_type("inner_view",1); } ) ;
-            } else { // This might or might not throw depending on compilation options. 
+            if ( i < MAX_NUM_ALLOCS - 1000 ) {
+              EXPECT_NO_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
+            } else { // This might or might not throw depending on compilation options.
               try {
-                view_allocs_test(i) = view_type("inner_view",1);
+                view_allocs_test( i ) = view_type( "inner_view", 1 );
               }
-              catch (...) {}
+              catch ( ... ) {}
             }
           }
 
-        } //end allocation for loop
+        } // End allocation for loop.
 
-        for ( auto i = 0; i < MAX_NUM_ALLOCS -1; ++i ) {
+        for ( auto i = 0; i < MAX_NUM_ALLOCS - 1; ++i ) {
 
-          view_allocs_test(i) = view_type();
+          view_allocs_test( i ) = view_type();
 
-        } //end deallocation for loop
+        } // End deallocation for loop.
 
-        view_allocs_test = view_of_view_type(); // deallocate the view of views
+        view_allocs_test = view_of_view_type(); // Deallocate the view of views.
       }
 
-      // Member
-      view_of_view_type view_allocs_test ;
-    } ;
-
-    // trigger the test via the TestMaxUVMAllocs constructor
-    TestMaxUVMAllocs() ;
+      // Member.
+      view_of_view_type view_allocs_test;
+    };
 
+    // Trigger the test via the TestMaxUVMAllocs constructor.
+    TestMaxUVMAllocs();
   }
-  #undef MAX_NUM_ALLOCS 
+
+  #undef MAX_NUM_ALLOCS
 }
 
-template< class MemSpace , class ExecSpace >
+template< class MemSpace, class ExecSpace >
 struct TestViewCudaAccessible {
-
   enum { N = 1000 };
 
-  using V = Kokkos::View<double*,MemSpace> ;
+  using V = Kokkos::View< double*, MemSpace >;
 
-  V m_base ;
+  V m_base;
 
   struct TagInit {};
   struct TagTest {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_base[i] != i + 1 ) ++error_count ; }
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_base[i] != i + 1 ) ++error_count; }
 
   TestViewCudaAccessible()
-    : m_base("base",N)
+    : m_base( "base", N )
     {}
 
   static void run()
-    {
-      TestViewCudaAccessible self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
-      MemSpace::execution_space::fence();
-      // Next access is a different execution space, must complete prior kernel.
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
+  {
+    TestViewCudaAccessible self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space, TagInit >( 0, N ), self );
+    MemSpace::execution_space::fence();
+
+    // The next access is from a different execution space, so the prior kernel must complete first.
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
 };
 
-TEST_F( cuda , impl_view_accessible )
+TEST_F( cuda, impl_view_accessible )
 {
-  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaSpace, Kokkos::Cuda >::run();
 
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::HostSpace::execution_space >::run();
 
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace::execution_space >::run();
 }
 
 template< class MemSpace >
 struct TestViewCudaTexture {
-
   enum { N = 1000 };
 
-  using V = Kokkos::View<double*,MemSpace> ;
-  using T = Kokkos::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
+  using V = Kokkos::View< double*, MemSpace >;
+  using T = Kokkos::View< const double*, MemSpace, Kokkos::MemoryRandomAccess >;
 
-  V m_base ;
-  T m_tex ;
+  V m_base;
+  T m_tex;
 
   struct TagInit {};
   struct TagTest {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_tex[i] != i + 1 ) ++error_count ; }
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_tex[i] != i + 1 ) ++error_count; }
 
   TestViewCudaTexture()
-    : m_base("base",N)
+    : m_base( "base", N )
     , m_tex( m_base )
     {}
 
   static void run()
-    {
-      EXPECT_TRUE( ( std::is_same< typename V::reference_type
-                                 , double &
-                                 >::value ) );
-
-      EXPECT_TRUE( ( std::is_same< typename T::reference_type
-                                 , const double
-                                 >::value ) );
-
-      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
-      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
-
-      TestViewCudaTexture self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
+  {
+    EXPECT_TRUE( ( std::is_same< typename V::reference_type, double & >::value ) );
+    EXPECT_TRUE( ( std::is_same< typename T::reference_type, const double >::value ) );
+
+    EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view.
+    EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value.
 
+    TestViewCudaTexture self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda, TagInit >( 0, N ), self );
 
-TEST_F( cuda , impl_view_texture )
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
+};
+
+TEST_F( cuda, impl_view_texture )
 {
   TestViewCudaTexture< Kokkos::CudaSpace >::run();
   TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
index fd8a647ef3f03b9d1109a464a51cd06e90de703d..0aea35db517bdba78967eb8b443cb771aaf2215f 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_assign_strided ) {
+TEST_F( cuda, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_left_0 ) {
+TEST_F( cuda, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_1 ) {
+TEST_F( cuda, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_2 ) {
+TEST_F( cuda, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_3 ) {
+TEST_F( cuda, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_0 ) {
+TEST_F( cuda, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_1 ) {
+TEST_F( cuda, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_3 ) {
+TEST_F( cuda, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
index 053fcfc2095c26540ff75e545bb4f920e0a96912..f31f4cbe62bc06bd5fee04abc6a71913c6fbddd9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_layoutleft_to_layoutleft) {
+TEST_F( cuda, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( cuda, view_subview_layoutright_to_layoutright) {
+TEST_F( cuda, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
index 4c5f2ef72fdd45b2b9033d54c3c83e70c3c089c1..0213a196e8612b4d9d3821de6d657803e9e22b6c 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign ) {
+TEST_F( cuda, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
index aee6f1730d6fb33e15877a043fe0ef8beaed11d9..181e1bab2ccb531722b08e627a8ee724fcd393d9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
index 2ef48c686e1d3a202aaf5f017d9ac88cc486085d..708cc1f5ba98fc7eb0f5603524c2b533eb090fee 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
index aec123ac235ef631172b3dc7c26151d2da7e38da..a3db996f8d87d63dd1a21ea74eb83a615a0e7162 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d ) {
+TEST_F( cuda, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
index e8ad2319963b2750e01d518309e84c7423a387d6..2f7cffa75da133039d0624d2d812053774013846 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
index e86b4513fd8b8fdeb85c7bce130b3ae274d5e214..949c6f3e0b9d3055e7da32ace79a810310861d99 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
index ad9dcc0fd1faccf2c8f8ff5e254b82a33f9d998b..3e68277a9e93b447a90a9b3496e0b4d0ccc407e2 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left ) {
+TEST_F( cuda, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
index f97d97e59c205fda791ac1d231b1429e1f8d4ec2..0cd91b7795f52f457f4403559cb353180bcdbe44 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
index 2a07f28f830a125d865eb89a4a456cb5d0aa2b62..cd1c13f7d073f1a445c35ded9eaa9fd121d35fee 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
index 3c51d9420184c91d8ddc1b15e9fb50659c1651d6..22d27535431f7b6414c52305a46547654c40ccbb 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right ) {
+TEST_F( cuda, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
index 835caa7b879891ed4cd0d24bac61bdaf6a686efb..5dc5f87b4e2b7faa2a52163f8b8af732b53000a9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
index 53bd5eee20205d56ca4356df4f2bb1118e0ff93d..318d8edbbb82eb6dd097b959e07861cf74a77099 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
index e4348319f695da2819e24143754777746bdc35d6..a2158f06c73db10193e1275c5d49c99738b0c06b 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<cuda/TestCuda_SubView_c01.cpp>
-#include<cuda/TestCuda_SubView_c02.cpp>
-#include<cuda/TestCuda_SubView_c03.cpp>
-#include<cuda/TestCuda_SubView_c04.cpp>
-#include<cuda/TestCuda_SubView_c05.cpp>
-#include<cuda/TestCuda_SubView_c06.cpp>
-#include<cuda/TestCuda_SubView_c07.cpp>
-#include<cuda/TestCuda_SubView_c08.cpp>
-#include<cuda/TestCuda_SubView_c09.cpp>
-#include<cuda/TestCuda_SubView_c10.cpp>
-#include<cuda/TestCuda_SubView_c11.cpp>
-#include<cuda/TestCuda_SubView_c12.cpp>
+#include <cuda/TestCuda_SubView_c01.cpp>
+#include <cuda/TestCuda_SubView_c02.cpp>
+#include <cuda/TestCuda_SubView_c03.cpp>
+#include <cuda/TestCuda_SubView_c04.cpp>
+#include <cuda/TestCuda_SubView_c05.cpp>
+#include <cuda/TestCuda_SubView_c06.cpp>
+#include <cuda/TestCuda_SubView_c07.cpp>
+#include <cuda/TestCuda_SubView_c08.cpp>
+#include <cuda/TestCuda_SubView_c09.cpp>
+#include <cuda/TestCuda_SubView_c10.cpp>
+#include <cuda/TestCuda_SubView_c11.cpp>
+#include <cuda/TestCuda_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
index 13834d09ad03854d1ac1ae17c7e8a159efa55ca7..8d9b9328ba9691fe90947554aeb9e9825322d55a 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
@@ -40,81 +40,87 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , team_tag )
+TEST_F( cuda, team_tag )
 {
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( cuda , team_shared_request) {
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( cuda, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-//THis Tests request to much L0 scratch
-//TEST_F( cuda, team_scratch_request) {
-//  TestScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-//  TestScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+// This test requests too much L0 scratch.
+//TEST_F( cuda, team_scratch_request )
+//{
+//  TestScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+//  TestScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 //}
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( cuda , team_lambda_shared_request) {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( cuda, team_lambda_shared_request )
+{
   TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
   TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( cuda, shmem_size) {
+TEST_F( cuda, shmem_size )
+{
   TestShmemSize< Kokkos::Cuda >();
 }
 
-TEST_F( cuda, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( cuda, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( cuda , team_vector )
+#if !defined( KOKKOS_CUDA_CLANG_WORKAROUND ) && !defined( KOKKOS_ARCH_PASCAL )
+TEST_F( cuda, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 10 ) ) );
 }
+#endif
 
 TEST_F( cuda, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 16, 16 );
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
index c01ca1c1463c6573c8d9e51c0ca31ed43c19941e..be0c4c5715eeba492112e9a83dbc3cba09796d98 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
@@ -40,20 +40,21 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_a ) {
+TEST_F( cuda, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::CudaSpace >();
   test_view_mapping_operator< Kokkos::CudaSpace >();
 }
 
-TEST_F( cuda , view_of_class )
+TEST_F( cuda, view_of_class )
 {
   TestViewMappingClassValue< Kokkos::CudaSpace >::run();
   TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
index 8e821ada000678c762b22db574dd1e0d816bbd54..b4d8e5d953f8e753eac945560fac763589bd2025 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_d ) {
+TEST_F( cuda, impl_view_mapping_d )
+{
   test_view_mapping< Kokkos::CudaHostPinnedSpace >();
   test_view_mapping_operator< Kokkos::CudaHostPinnedSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
index cf29a68e96586dc5d194bd0b28338259784dceb0..e4e6894c5346b6283371903bc2e1bdea18c5f399 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_c ) {
+TEST_F( cuda, impl_view_mapping_c )
+{
   test_view_mapping< Kokkos::CudaUVMSpace >();
   test_view_mapping_operator< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
index db14b5158f6efa01a6397df98041827a830158d4..82a3dd83e88c3b047525771a5dd9deca32d6d891 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
@@ -40,73 +40,77 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , view_nested_view )
+TEST_F( cuda, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Cuda >();
 }
 
-
-
-TEST_F( cuda , view_remap )
+TEST_F( cuda, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
 
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::CudaUVMSpace > output_type ;
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::CudaUVMSpace > output_type;
 
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::CudaUVMSpace > input_type ;
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::CudaUVMSpace > input_type;
 
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::CudaUVMSpace > diff_type ;
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::CudaUVMSpace > diff_type;
 
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
 
   Kokkos::fence();
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
   Kokkos::fence();
 
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
- 
+  // Kokkos::deep_copy( diff, input ); // Throws with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
   Kokkos::fence();
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
+
   Kokkos::fence();
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , view_aggregate )
+TEST_F( cuda, view_aggregate )
 {
   TestViewAggregate< Kokkos::Cuda >();
 }
 
-TEST_F( cuda , template_meta_functions )
+TEST_F( cuda, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
+  TestTemplateMetaFunctions< int, Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
index 07d425647330228815a7103e6f7596a8a2f2a460..27450fa6ff827dbbe6970331eca68589a423c406 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
@@ -40,17 +40,20 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
+TEST_F( cuda, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::CudaSpace, Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaUVMSpace, Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace::execution_space >();
 }
 
-TEST_F( cuda , impl_view_mapping_b ) {
+TEST_F( cuda, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::CudaSpace >();
   test_view_mapping_subview< Kokkos::CudaUVMSpace >();
   test_view_mapping_subview< Kokkos::CudaHostPinnedSpace >();
@@ -59,5 +62,4 @@ TEST_F( cuda , impl_view_mapping_b ) {
   TestViewMappingAtomic< Kokkos::CudaHostPinnedSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
index 34721f02dc73f418ba7c348fe65c3a59d534dc7c..56524111aec939d0ff2b80196b5352a44f6919dd 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
@@ -40,16 +40,17 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_a) {
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
+TEST_F( cuda, view_api_a )
+{
+  typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_texture_managed;
+  typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess | Kokkos::Unmanaged> > view_texture_unmanaged;
 
-  TestViewAPI< double , Kokkos::Cuda >();
+  TestViewAPI< double, Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
index abbcf3bf8bfa6d89ff5c5a5891d8cd16018becf0..d5fd24456d782409450fcf949d6c6280504bb785 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_b) {
-  TestViewAPI< double , Kokkos::CudaUVMSpace >();
+TEST_F( cuda, view_api_b )
+{
+  TestViewAPI< double, Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
index 9899642035ada183fe7b7b5c4a60610e3c271739..649023e4afcaf921511edab82cc10035776246ae 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_c) {
-  TestViewAPI< double , Kokkos::CudaHostPinnedSpace >();
+TEST_F( cuda, view_api_c )
+{
+  TestViewAPI< double, Kokkos::CudaHostPinnedSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
index 9bc09ba893affeec45923883b62751534a7e86dc..b46b1e5f8173bd724c0333de776366704c23f152 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , view_space_assign ) {
-  view_space_assign< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >();
-  view_space_assign< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >();
+TEST_F( cuda, view_space_assign )
+{
+  view_space_assign< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >();
+  view_space_assign< Kokkos::CudaSpace, Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
index 28ae5b41b039a385db047de37c5a0d1865a1ee1b..ed9bb68cd60a004c214ec473ae35653f61c6a814 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_OPENMP_HPP
 #define KOKKOS_TEST_OPENMP_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,13 +56,8 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
@@ -74,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -95,23 +89,24 @@ protected:
     const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
     const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
 
-    const unsigned threads_count = std::max( 1u , numa_count ) *
-                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+    const unsigned threads_count = std::max( 1u, numa_count ) *
+                                   std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 );
 
     Kokkos::OpenMP::initialize( threads_count );
-    Kokkos::OpenMP::print_configuration( std::cout , true );
-    srand(10231);
+    Kokkos::print_configuration( std::cout, true );
+    srand( 10231 );
   }
 
   static void TearDownTestCase()
   {
     Kokkos::OpenMP::finalize();
 
-    omp_set_num_threads(1);
+    omp_set_num_threads( 1 );
 
-    ASSERT_EQ( 1 , omp_get_max_threads() );
+    ASSERT_EQ( 1, omp_get_max_threads() );
   }
 };
 
-}
+} // namespace Test
+
 #endif
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
index ed6c9f8d1696c9c653c82f52b14a8a73520b7735..2585c01973b3aeba5fd00f27068c361b15552800 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
@@ -40,165 +40,162 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , atomics )
+TEST_F( openmp, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 3 ) ) );
 }
 
-TEST_F( openmp , atomic_operations )
+TEST_F( openmp, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 4 ) ) );
   }
-
 }
 
-
-TEST_F( openmp , atomic_views_integral )
+TEST_F( openmp, atomic_views_integral )
 {
   const long length = 1000000;
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 8 ) ) );
-
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 8 ) ) );
   }
 }
 
-TEST_F( openmp , atomic_views_nonintegral )
+TEST_F( openmp, atomic_views_nonintegral )
 {
   const long length = 1000000;
   {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 4 ) ) );
-
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::OpenMP >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::OpenMP >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::OpenMP >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::OpenMP >( length, 4 ) ) );
   }
 }
 
-TEST_F( openmp , atomic_view_api )
+TEST_F( openmp, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::OpenMP>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
index 126d730f0ff96272ae1e21eb5f8f81523fda8f02..b4f32dac706222e2c1f79f43469eadb4f5e3e6c6 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
@@ -40,65 +40,90 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , init ) {
+TEST_F( openmp, init )
+{
   ;
 }
 
-TEST_F( openmp , md_range ) {
-  TestMDRange_2D< Kokkos::OpenMP >::test_for2(100,100);
+TEST_F( openmp, mdrange_for )
+{
+  Kokkos::Timer timer;
+  TestMDRange_2D< Kokkos::OpenMP >::test_for2( 10000, 1000 );
+  std::cout << " 2D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_3D< Kokkos::OpenMP >::test_for3( 100, 100, 1000 );
+  std::cout << " 3D: " << timer.seconds() << std::endl;
 
-  TestMDRange_3D< Kokkos::OpenMP >::test_for3(100,100,100);
+  timer.reset();
+  TestMDRange_4D< Kokkos::OpenMP >::test_for4( 100, 10, 100, 100 );
+  std::cout << " 4D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_5D< Kokkos::OpenMP >::test_for5( 100, 10, 10, 100, 50 );
+  std::cout << " 5D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_6D< Kokkos::OpenMP >::test_for6( 10, 10, 10, 10, 50, 50 );
+  std::cout << " 6D: " << timer.seconds() << std::endl;
 }
 
-TEST_F( openmp, policy_construction) {
+TEST_F( openmp, mdrange_reduce )
+{
+  TestMDRange_2D< Kokkos::OpenMP >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::OpenMP >::test_reduce3( 100, 10, 100 );
+}
+
+TEST_F( openmp, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::OpenMP >();
   TestTeamPolicyConstruction< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , range_tag )
+TEST_F( openmp, range_tag )
 {
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(0);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(3);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( openmp , compiler_macros )
+TEST_F( openmp, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::OpenMP >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( openmp , memory_pool )
+TEST_F( openmp, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::OpenMP >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -112,24 +137,24 @@ TEST_F( openmp , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( openmp , task_fib )
+TEST_F( openmp, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::OpenMP >::run(i, (i+1)*(i+1)*10000 );
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::OpenMP >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 );
   }
 }
 
-TEST_F( openmp , task_depend )
+TEST_F( openmp, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::OpenMP >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::OpenMP >::run( i );
   }
 }
 
-TEST_F( openmp , task_team )
+TEST_F( openmp, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::OpenMP >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::OpenMP >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::OpenMP >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::OpenMP >::run( 1000 ); // Put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -137,53 +162,51 @@ TEST_F( openmp , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-TEST_F( openmp , cxx11 )
+TEST_F( openmp, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::OpenMP >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( openmp, tile_layout )
 {
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::OpenMP , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::OpenMP, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 9, 11 );
 }
 
-
-TEST_F( openmp , dispatch )
+TEST_F( openmp, dispatch )
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
index d41e1493eea6306d68087d1a8562ab963e1ec039..22c29308a6289361bfa0b62d47e579e4bb1e29c2 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
@@ -40,46 +40,52 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, long_reduce) {
-  TestReduce< long ,   Kokkos::OpenMP >( 0 );
-  TestReduce< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce )
+{
+  TestReduce< long, Kokkos::OpenMP >( 0 );
+  TestReduce< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, double_reduce) {
-  TestReduce< double ,   Kokkos::OpenMP >( 0 );
-  TestReduce< double ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, double_reduce )
+{
+  TestReduce< double, Kokkos::OpenMP >( 0 );
+  TestReduce< double, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp , reducers )
+TEST_F( openmp, reducers )
 {
-  TestReducers<int, Kokkos::OpenMP>::execute_integer();
-  TestReducers<size_t, Kokkos::OpenMP>::execute_integer();
-  TestReducers<double, Kokkos::OpenMP>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::OpenMP>::execute_basic();
+  TestReducers< int, Kokkos::OpenMP >::execute_integer();
+  TestReducers< size_t, Kokkos::OpenMP >::execute_integer();
+  TestReducers< double, Kokkos::OpenMP >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::OpenMP >::execute_basic();
 }
 
-TEST_F( openmp, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamic< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::OpenMP >( 0 );
+  TestReduceDynamic< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamic< double ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::OpenMP >( 0 );
+  TestReduceDynamic< double, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::OpenMP >( 0 );
+  TestReduceDynamicView< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp , scan )
+TEST_F( openmp, scan )
 {
-  TestScan< Kokkos::OpenMP >::test_range( 1 , 1000 );
+  TestScan< Kokkos::OpenMP >::test_range( 1, 1000 );
   TestScan< Kokkos::OpenMP >( 0 );
   TestScan< Kokkos::OpenMP >( 100000 );
   TestScan< Kokkos::OpenMP >( 10000000 );
@@ -87,10 +93,11 @@ TEST_F( openmp , scan )
 }
 
 #if 0
-TEST_F( openmp , scan_small )
+TEST_F( openmp, scan_small )
 {
-  typedef TestScan< Kokkos::OpenMP , Kokkos::Impl::OpenMPExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::OpenMP, Kokkos::Impl::OpenMPExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -101,38 +108,39 @@ TEST_F( openmp , scan_small )
 }
 #endif
 
-TEST_F( openmp  , team_scan )
+TEST_F( openmp, team_scan )
 {
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( openmp , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( openmp, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( openmp , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( openmp, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( openmp , reduction_deduction )
+TEST_F( openmp, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
index 9854417e42da5a8bdd6986b85fbdd754bab3e57b..fefae073227a7086bb440152b76abf16dc9c00b2 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_assign_strided ) {
+TEST_F( openmp, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_0 ) {
+TEST_F( openmp, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_1 ) {
+TEST_F( openmp, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_2 ) {
+TEST_F( openmp, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_3 ) {
+TEST_F( openmp, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_0 ) {
+TEST_F( openmp, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_1 ) {
+TEST_F( openmp, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_3 ) {
+TEST_F( openmp, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
index 2aa1fc5c633ffab0319c37c7a00a9abe48438597..7de7ca91bdc082057bccc1b71ec8f482a16bc0f9 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_layoutleft_to_layoutleft) {
+TEST_F( openmp, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( openmp, view_subview_layoutright_to_layoutright) {
+TEST_F( openmp, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
index 1a6871cfca8f3136b13011f66576cd7a9d891978..d727ec0ee592c57d357b8cfebfa83a9bcc06eb12 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign ) {
+TEST_F( openmp, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
index b04edbb997d564a2e921bacf7b36959b17e8755f..df43f555d385037dafe3a29b9cec66ef2eb9b781 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
index 765e235830db2f7e48ad8fe9df271429fef2c2ab..38f241ebf7bdea50af2f8a0b06dd69b16175667c 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
index 9d8b62708a3d4d898ddbc923b733c78c869c2826..11a4ea8ac24bf457f9d4fbe97b5180536d1fac69 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d ) {
+TEST_F( openmp, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
index 9c19cf0e57dcf7058f4f0aeb4752465c470e9fa9..a91baa34df3f0fc41db37909fdcdbeefc27a3158 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
index c1bdf72351b02958f5e1e857c41f7e5d999ade64..20d4d9bd64462eaa9d90a5d776c7129a7a816312 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
index 08a3b5a54a2c66599ebc61384357324a79815507..528df1c0700d7582f427310d8f7610376f9166bb 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left ) {
+TEST_F( openmp, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
index 0864ebbdaa44b1bd00a154fe2f7fcf4b55ae48eb..d9eea8dba91a7c03cdfd8460b2241438ffbbce1d 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
index e38dfecbf6e353bcab69f7341d2754ea6ef85cf9..f909dc33c067ca4ff6c3badeddf92c6bb12a2bd6 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
index b7e4683d23d18bb838c97a1fa198b2d38874de77..59996d5e33b594a23c7e368354208c68707339e9 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right ) {
+TEST_F( openmp, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
index fc3e66fd4853c6104503aaf461eda97183cb44e1..3f9c215d9b10dbbeb3aada555515ab27c1e38adb 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
index e21a13ee579e5052241252ffa6b99ba49f9c6b47..d3a73483a0bc11c4d60eb4d6d658c00fde838566 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
index 9da159ab5773a0a7b1a49605cf1a88294a29d09d..399c6e92e4c7cf858ecef02a97e1bf4742ec6eda 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<openmp/TestOpenMP_SubView_c01.cpp>
-#include<openmp/TestOpenMP_SubView_c02.cpp>
-#include<openmp/TestOpenMP_SubView_c03.cpp>
-#include<openmp/TestOpenMP_SubView_c04.cpp>
-#include<openmp/TestOpenMP_SubView_c05.cpp>
-#include<openmp/TestOpenMP_SubView_c06.cpp>
-#include<openmp/TestOpenMP_SubView_c07.cpp>
-#include<openmp/TestOpenMP_SubView_c08.cpp>
-#include<openmp/TestOpenMP_SubView_c09.cpp>
-#include<openmp/TestOpenMP_SubView_c10.cpp>
-#include<openmp/TestOpenMP_SubView_c11.cpp>
-#include<openmp/TestOpenMP_SubView_c12.cpp>
+#include <openmp/TestOpenMP_SubView_c01.cpp>
+#include <openmp/TestOpenMP_SubView_c02.cpp>
+#include <openmp/TestOpenMP_SubView_c03.cpp>
+#include <openmp/TestOpenMP_SubView_c04.cpp>
+#include <openmp/TestOpenMP_SubView_c05.cpp>
+#include <openmp/TestOpenMP_SubView_c06.cpp>
+#include <openmp/TestOpenMP_SubView_c07.cpp>
+#include <openmp/TestOpenMP_SubView_c08.cpp>
+#include <openmp/TestOpenMP_SubView_c09.cpp>
+#include <openmp/TestOpenMP_SubView_c10.cpp>
+#include <openmp/TestOpenMP_SubView_c11.cpp>
+#include <openmp/TestOpenMP_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
index 38cf0a0f409c8dbe5d923cae4b88bec619a5a8b0..216789e8bf6ebcd1d2deab1e567317376c611e0b 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
@@ -40,67 +40,73 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , team_tag )
+TEST_F( openmp, team_tag )
 {
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( openmp , team_shared_request) {
-  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, team_shared_request )
+{
+  TestSharedTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( openmp, team_scratch_request) {
-  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( openmp , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( openmp, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( openmp, shmem_size) {
+TEST_F( openmp, shmem_size )
+{
   TestShmemSize< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( openmp , team_vector )
+TEST_F( openmp, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -112,11 +118,10 @@ TEST_F( openmp , team_vector )
 #ifndef SKIP_TEST
 TEST_F( openmp, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
index 82cbf3ea18ecf7c3c424c73fe3e41ebf4a4e0c26..aead381a11e5b5a88763d9622deac55c3ceaf631 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , impl_view_mapping_a ) {
+TEST_F( openmp, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::OpenMP >();
   test_view_mapping_operator< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
index b2d4f87fdd417ab2d1036884dcce4b0df5793396..c802fb79caf081b103c6e65bf54d8e20fe3b7193 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >();
+TEST_F( openmp, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , impl_view_mapping_b ) {
+TEST_F( openmp, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::OpenMP >();
   TestViewMappingAtomic< Kokkos::OpenMP >::run();
 }
 
-TEST_F( openmp, view_api) {
-  TestViewAPI< double , Kokkos::OpenMP >();
+TEST_F( openmp, view_api )
+{
+  TestViewAPI< double, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , view_nested_view )
+TEST_F( openmp, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::OpenMP >();
 }
 
-
-
-TEST_F( openmp , view_remap )
+TEST_F( openmp, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::OpenMP > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::OpenMP > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::OpenMP > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::OpenMP > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::OpenMP > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::OpenMP > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throws with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( openmp , view_aggregate )
+TEST_F( openmp, view_aggregate )
 {
   TestViewAggregate< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , template_meta_functions )
+TEST_F( openmp, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::OpenMP >();
+  TestTemplateMetaFunctions< int, Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..907fe23ea5e7c6b11a52c6327787ddee0108f89e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_QTHREADS_HPP
+#define KOKKOS_TEST_QTHREADS_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskScheduler.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestPolicyConstruction.hpp>
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+class qthreads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    const unsigned threads_count = std::max( 1u, numa_count ) *
+                                   std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 );
+
+    Kokkos::Qthreads::initialize( threads_count );
+    Kokkos::print_configuration( std::cout, true );
+
+    srand( 10231 );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Qthreads::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e64c3305db616b09c24c2b47d64c9153e3aeb0df
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, atomics )
+{
+#if 0
+  const int loop_count = 1e4;
+
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 3 ) ) );
+#endif
+}
+
+TEST_F( qthreads, atomic_operations )
+{
+#if 0
+  const int start = 1; // Avoid zero for division.
+  const int end = 11;
+
+  for ( int i = start; i < end; ++i )
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_integral )
+{
+#if 0
+  const long length = 1000000;
+
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 8 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_nonintegral )
+{
+#if 0
+  const long length = 1000000;
+
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_view_api )
+{
+#if 0
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0faec84056997dd0d1236ff8c00f2218b2549cf9
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, init )
+{
+  ;
+}
+
+TEST_F( qthreads, md_range )
+{
+#if 0
+  TestMDRange_2D< Kokkos::Qthreads >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Qthreads >::test_for3( 100, 100, 100 );
+#endif
+}
+
+TEST_F( qthreads, policy_construction )
+{
+#if 0
+  TestRangePolicyConstruction< Kokkos::Qthreads >();
+  TestTeamPolicyConstruction< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, range_tag )
+{
+#if 0
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, compiler_macros )
+{
+#if 0
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthreads >() ) );
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, memory_pool )
+{
+#if 0
+  bool val = TestMemoryPool::test_mempool< Kokkos::Qthreads >( 128, 128000000 );
+  ASSERT_TRUE( val );
+
+  TestMemoryPool::test_mempool2< Kokkos::Qthreads >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Qthreads >();
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+TEST_F( qthreads, task_fib )
+{
+#if 0
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Qthreads >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 );
+  }
+#endif
+}
+
+TEST_F( qthreads, task_depend )
+{
+#if 0
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Qthreads >::run( i );
+  }
+#endif
+}
+
+TEST_F( qthreads, task_team )
+{
+#if 0
+  TestTaskScheduler::TestTaskTeam< Kokkos::Qthreads >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Qthreads >::run( 1000 ); // Put back after testing.
+#endif
+}
+
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+
+TEST_F( qthreads, cxx11 )
+{
+#if 0
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Qthreads >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 4 ) ) );
+  }
+#endif
+}
+
+#endif
+
+TEST_F( qthreads, tile_layout )
+{
+#if 0
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 11 );
+#endif
+}
+
+TEST_F( qthreads, dispatch )
+{
+#if 0
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2470ac15c45431e852981a94f792bb2710535d7
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
@@ -0,0 +1,168 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, long_reduce )
+{
+#if 0
+  TestReduce< long, Kokkos::Qthreads >( 0 );
+  TestReduce< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, double_reduce )
+{
+#if 0
+  TestReduce< double, Kokkos::Qthreads >( 0 );
+  TestReduce< double, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, reducers )
+{
+#if 0
+  TestReducers< int, Kokkos::Qthreads >::execute_integer();
+  TestReducers< size_t, Kokkos::Qthreads >::execute_integer();
+  TestReducers< double, Kokkos::Qthreads >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Qthreads >::execute_basic();
+#endif
+}
+
+TEST_F( qthreads, long_reduce_dynamic )
+{
+#if 0
+  TestReduceDynamic< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, double_reduce_dynamic )
+{
+#if 0
+  TestReduceDynamic< double, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< double, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, long_reduce_dynamic_view )
+{
+#if 0
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 1000000 );
+#endif
+}
+
+TEST_F( qthreads, scan )
+{
+#if 0
+  TestScan< Kokkos::Qthreads >::test_range( 1, 1000 );
+  TestScan< Kokkos::Qthreads >( 0 );
+  TestScan< Kokkos::Qthreads >( 100000 );
+  TestScan< Kokkos::Qthreads >( 10000000 );
+  Kokkos::Qthreads::fence();
+#endif
+}
+
+TEST_F( qthreads, scan_small )
+{
+#if 0
+  typedef TestScan< Kokkos::Qthreads, Kokkos::Impl::QthreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 );
+  TestScanFunctor( 10000000 );
+
+  Kokkos::Qthreads::fence();
+#endif
+}
+
+TEST_F( qthreads, team_scan )
+{
+#if 0
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+#endif
+}
+
+TEST_F( qthreads, team_long_reduce )
+{
+#if 0
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif
+}
+
+TEST_F( qthreads, team_double_reduce )
+{
+#if 0
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif
+}
+
+TEST_F( qthreads, reduction_deduction )
+{
+#if 0
+  TestCXX11::test_reduction_deduction< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab873359a748e6086533454f7a0842a5e8dee9e6
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_auto_1d_left )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_auto_1d_right )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_auto_1d_stride )
+{
+#if 0
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_assign_strided )
+{
+#if 0
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_0 )
+{
+#if 0
+  TestViewSubview::test_left_0< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_1 )
+{
+#if 0
+  TestViewSubview::test_left_1< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_2 )
+{
+#if 0
+  TestViewSubview::test_left_2< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_left_3 )
+{
+#if 0
+  TestViewSubview::test_left_3< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_0 )
+{
+#if 0
+  TestViewSubview::test_right_0< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_1 )
+{
+#if 0
+  TestViewSubview::test_right_1< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_right_3 )
+{
+#if 0
+  TestViewSubview::test_right_3< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..199c5c795557bb4da254c24d320a99240768e014
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
@@ -0,0 +1,66 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_layoutleft_to_layoutleft )
+{
+#if 0
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+TEST_F( qthreads, view_subview_layoutright_to_layoutright )
+{
+#if 0
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f44909f3daffd71b13a12eba33b4e8e142e946ad
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bb936f8dd511034924d779362f34e10833b2668
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_atomic )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..27073dfa814683a77a0edc602e23f3c3aadcd0e2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_randomaccess )
+{
+#if 0
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b3cf488521b6ed84aa7eda62084ba737d485abf
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34dda63e64da0cb39b1a7d977ff08477aa8bbfec
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_atomic )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a4ee50fb2f6b41ddfc504192a3815d4a1775f5e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_randomaccess )
+{
+#if 0
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe386e34a8083a8bc2084b6957f57124a78d41c3
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3e0ab25291334f291adf3ba743c822eea552380
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_atomic )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df1f570e9dce927b75c11695a11124564e39d567
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_randomaccess )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc3c80d10d7b3fd544ed7b49fa56b9f2f4e8b5a7
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..14b331a4585efeb912c0ec7001cf0195657c60de
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_atomic )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..571382e66f52d5a6c8294af1d117ebaeb6fe25f5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_randomaccess )
+{
+#if 0
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab984c5f30e05958c0c601256ada3c13a70ee68d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
@@ -0,0 +1,12 @@
+#include <qthreads/TestQthreads_SubView_c01.cpp>
+#include <qthreads/TestQthreads_SubView_c02.cpp>
+#include <qthreads/TestQthreads_SubView_c03.cpp>
+#include <qthreads/TestQthreads_SubView_c04.cpp>
+#include <qthreads/TestQthreads_SubView_c05.cpp>
+#include <qthreads/TestQthreads_SubView_c06.cpp>
+#include <qthreads/TestQthreads_SubView_c07.cpp>
+#include <qthreads/TestQthreads_SubView_c08.cpp>
+#include <qthreads/TestQthreads_SubView_c09.cpp>
+#include <qthreads/TestQthreads_SubView_c10.cpp>
+#include <qthreads/TestQthreads_SubView_c11.cpp>
+#include <qthreads/TestQthreads_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7b81283fbf27e97427defbf1b0894793cc44ed2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
@@ -0,0 +1,143 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, team_tag )
+{
+#if 0
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+#endif
+}
+
+TEST_F( qthreads, team_shared_request )
+{
+#if 0
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+TEST_F( qthreads, team_scratch_request )
+{
+#if 0
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( qthreads, team_lambda_shared_request )
+{
+#if 0
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+#endif
+
+TEST_F( qthreads, shmem_size )
+{
+#if 0
+  TestShmemSize< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, multi_level_scratch )
+{
+#if 0
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >();
+#endif
+}
+
+TEST_F( qthreads, team_vector )
+{
+#if 0
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 10 ) ) );
+#endif
+}
+
+#ifdef KOKKOS_COMPILER_GNU
+#if ( KOKKOS_COMPILER_GNU == 472 )
+#define SKIP_TEST
+#endif
+#endif
+
+#ifndef SKIP_TEST
+TEST_F( qthreads, triple_nested_parallelism )
+{
+#if 0
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 16, 16 );
+#endif
+}
+#endif
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd876a36bfa457f3c5f895d604f38be27fa4e986
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_view_mapping_a )
+{
+#if 0
+  test_view_mapping< Kokkos::Qthreads >();
+  test_view_mapping_operator< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adf048b61360b1aa9d49d9ce0f93453d580eb1a4
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_shared_alloc )
+{
+#if 0
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, impl_view_mapping_b )
+{
+#if 0
+  test_view_mapping_subview< Kokkos::Qthreads >();
+  TestViewMappingAtomic< Kokkos::Qthreads >::run();
+#endif
+}
+
+TEST_F( qthreads, view_api )
+{
+#if 0
+  TestViewAPI< double, Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_nested_view )
+{
+#if 0
+  ::Test::view_nested_view< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, view_remap )
+{
+#if 0
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Qthreads > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Would throw due to incompatible shapes.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, view_aggregate )
+{
+#if 0
+  TestViewAggregate< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, template_meta_functions )
+{
+#if 0
+  TestTemplateMetaFunctions< int, Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial.hpp b/lib/kokkos/core/unit_test/serial/TestSerial.hpp
index c0ffa6afb1843f7fe61693a778d9389e4c20fccb..03da07e065e371e636f1d2c59ba99a2832dd574c 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial.hpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_SERIAL_HPP
 #define KOKKOS_TEST_SERIAL_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,21 +56,14 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
 #include <TestAtomic.hpp>
 #include <TestAtomicOperations.hpp>
-
 #include <TestAtomicViews.hpp>
-
 #include <TestRange.hpp>
 #include <TestTeam.hpp>
 #include <TestReduce.hpp>
@@ -76,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -92,14 +84,16 @@ namespace Test {
 class serial : public ::testing::Test {
 protected:
   static void SetUpTestCase()
-    {
-      Kokkos::HostSpace::execution_space::initialize();
-    }
+  {
+    Kokkos::HostSpace::execution_space::initialize();
+  }
+
   static void TearDownTestCase()
-    {
-      Kokkos::HostSpace::execution_space::finalize();
-    }
+  {
+    Kokkos::HostSpace::execution_space::finalize();
+  }
 };
 
-}
+} // namespace Test
+
 #endif
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
index 729a76556dc4f3ff8110ba62b02dfc57ec878590..81ba532a3d45322ca561498585763d413256be3c 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
@@ -40,165 +40,165 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , atomics )
+TEST_F( serial, atomics )
 {
-  const int loop_count = 1e6 ;
+  const int loop_count = 1e6;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 3 ) ) );
 }
 
-TEST_F( serial , atomic_operations )
+TEST_F( serial, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 12) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 4 ) ) );
   }
-
 }
 
 
-TEST_F( serial , atomic_views_integral )
+TEST_F( serial, atomic_views_integral )
 {
   const long length = 1000000;
-  {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 8 ) ) );
 
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 8 ) ) );
   }
 }
 
-TEST_F( serial , atomic_views_nonintegral )
+TEST_F( serial, atomic_views_nonintegral )
 {
   const long length = 1000000;
-  {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 4 ) ) );
 
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 4 ) ) );
   }
 }
 
-TEST_F( serial , atomic_view_api )
+TEST_F( serial, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Serial>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
index 43fc4c358745f3f01032723d029796a78bcf76a1..b40ed3f4afc5b4176f02c2ad7d16a5ce19f2614b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
@@ -40,50 +40,61 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , md_range ) {
-  TestMDRange_2D< Kokkos::Serial >::test_for2(100,100);
+TEST_F( serial, mdrange_for )
+{
+  TestMDRange_2D< Kokkos::Serial >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Serial >::test_for3( 100, 10, 100 );
+  TestMDRange_4D< Kokkos::Serial >::test_for4( 100, 10, 10, 10 );
+  TestMDRange_5D< Kokkos::Serial >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Serial >::test_for6( 10, 10, 10, 10, 5, 5 );
+}
 
-  TestMDRange_3D< Kokkos::Serial >::test_for3(100,100,100);
+TEST_F( serial, mdrange_reduce )
+{
+  TestMDRange_2D< Kokkos::Serial >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::Serial >::test_reduce3( 100, 10, 100 );
 }
 
-TEST_F( serial, policy_construction) {
+TEST_F( serial, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::Serial >();
   TestTeamPolicyConstruction< Kokkos::Serial >();
 }
 
-TEST_F( serial , range_tag )
+TEST_F( serial, range_tag )
 {
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( serial , compiler_macros )
+TEST_F( serial, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Serial >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( serial , memory_pool )
+TEST_F( serial, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Serial >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -97,24 +108,24 @@ TEST_F( serial , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( serial , task_fib )
+TEST_F( serial, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Serial >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Serial >::run( i );
   }
 }
 
-TEST_F( serial , task_depend )
+TEST_F( serial, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Serial >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Serial >::run( i );
   }
 }
 
-TEST_F( serial , task_team )
+TEST_F( serial, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Serial >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Serial >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Serial >::run( 1000 );
+  // TestTaskScheduler::TestTaskTeamValue< Kokkos::Serial >::run( 1000 ); // Put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -122,44 +133,40 @@ TEST_F( serial , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-TEST_F( serial , cxx11 )
+TEST_F( serial, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Serial >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( serial, tile_layout )
 {
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Serial , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Serial, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Serial, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Serial, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Serial, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 9, 11 );
 }
 
-
-
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
index 25b5ac6d16a8d101dd1e7d940007a107d1c814fc..8a3d518cfbea93b97d9a885ac061a79494676362 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
@@ -40,83 +40,90 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, long_reduce) {
-  TestReduce< long ,   Kokkos::Serial >( 0 );
-  TestReduce< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce )
+{
+  TestReduce< long, Kokkos::Serial >( 0 );
+  TestReduce< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, double_reduce) {
-  TestReduce< double ,   Kokkos::Serial >( 0 );
-  TestReduce< double ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, double_reduce )
+{
+  TestReduce< double, Kokkos::Serial >( 0 );
+  TestReduce< double, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial , reducers )
+TEST_F( serial, reducers )
 {
-  TestReducers<int, Kokkos::Serial>::execute_integer();
-  TestReducers<size_t, Kokkos::Serial>::execute_integer();
-  TestReducers<double, Kokkos::Serial>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Serial>::execute_basic();
+  TestReducers< int, Kokkos::Serial >::execute_integer();
+  TestReducers< size_t, Kokkos::Serial >::execute_integer();
+  TestReducers< double, Kokkos::Serial >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Serial >::execute_basic();
 }
 
-TEST_F( serial, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Serial >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::Serial >( 0 );
+  TestReduceDynamic< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Serial >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::Serial >( 0 );
+  TestReduceDynamic< double, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Serial >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::Serial >( 0 );
+  TestReduceDynamicView< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial , scan )
+TEST_F( serial, scan )
 {
-  TestScan< Kokkos::Serial >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Serial >::test_range( 1, 1000 );
   TestScan< Kokkos::Serial >( 0 );
   TestScan< Kokkos::Serial >( 10 );
   TestScan< Kokkos::Serial >( 10000 );
 }
 
-TEST_F( serial  , team_scan )
+TEST_F( serial, team_scan )
 {
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( serial , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( serial, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( serial , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( serial, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( serial , reduction_deduction )
+TEST_F( serial, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
index bc838ccde4b36cf964d0da97500fdbd921a85aa0..3dc3e2019d9fd3927f422c689bfbd65fc45a997b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_assign_strided ) {
+TEST_F( serial, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_0 ) {
+TEST_F( serial, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_1 ) {
+TEST_F( serial, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_2 ) {
+TEST_F( serial, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_3 ) {
+TEST_F( serial, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_0 ) {
+TEST_F( serial, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_1 ) {
+TEST_F( serial, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_3 ) {
+TEST_F( serial, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
index e6a5b56d3ed48ac2301e56b944e4924dcb79451e..536c3bf1979a5b3b9bc33cd8768a86ca3367a8c7 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_layoutleft_to_layoutleft) {
+TEST_F( serial, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( serial, view_subview_layoutright_to_layoutright) {
+TEST_F( serial, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
index 0b7a0d3bfa6fa514195a4fd6241fc262f0ad884d..579a12bf782a34c4739c9e4a30685878dc55900e 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign ) {
+TEST_F( serial, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
index 8ca7285c1f8331cb6992411d6b35d7bc054945a3..ff009fef27715a8b366e848267eaa4c6c10bc2d7 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
index 1d156c741524315d2fb66fdc5e852329d846d3ae..a20478433cd2b87f0e07a0e793143c4f6f2ddf40 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
index ebf0e5c99155afe17dea3807981d712e1d67c601..a34b26d9f79317b90dd0bfaf06385ad638d4757f 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d ) {
+TEST_F( serial, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
index 74acb92f1b9e632a980b7d0141a54200aebbfd15..6d1882cf04e3d384773d384215cd0244ebd8cfcd 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
index 8075d46e0fe15c4c15a47e80f6172d4990fd6ce5..12fb883b63e12812c947facc4b070c0577d09783 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
index 9ce8222643a5d3a183fad578013945a67efd6847..8aae20c0239d5a6272879887c7626f0e1a0e2f2a 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left ) {
+TEST_F( serial, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
index c8a5c8f33fdc70a2408aade42f21b3c451753b4c..e75db8d52dc1250b582d62c7e51b6bda8ce00b9b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
index b66f15f17da1b7f0bcb24459678965dacee04f9b..b9cea2ce89c6f2bb311299ee6463ac34185245d8 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
index 5e5e3cf3d1af0f0755ab8fa3f8be9f846ff554e9..e5dbcead376ebdcb37a4bb79dfdfe1916b3e2d0d 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right ) {
+TEST_F( serial, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
index 55a353bcafef5e852ec33c80d9084f7c2236efcc..3005030f934551a0f8ea5d6be7772cfefa605a98 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
index a168e1e232ff5f71cce593be776496cbd7dd6c25..fee8cb7af2a20cdebafa9270932cda2457363602 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
index a489b0fcb585aa0e12310f09a0701188b8814045..24dc6b5061412c04998f734cab9f1367a9b7d4fe 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<serial/TestSerial_SubView_c01.cpp>
-#include<serial/TestSerial_SubView_c02.cpp>
-#include<serial/TestSerial_SubView_c03.cpp>
-#include<serial/TestSerial_SubView_c04.cpp>
-#include<serial/TestSerial_SubView_c05.cpp>
-#include<serial/TestSerial_SubView_c06.cpp>
-#include<serial/TestSerial_SubView_c07.cpp>
-#include<serial/TestSerial_SubView_c08.cpp>
-#include<serial/TestSerial_SubView_c09.cpp>
-#include<serial/TestSerial_SubView_c10.cpp>
-#include<serial/TestSerial_SubView_c11.cpp>
-#include<serial/TestSerial_SubView_c12.cpp>
+#include <serial/TestSerial_SubView_c01.cpp>
+#include <serial/TestSerial_SubView_c02.cpp>
+#include <serial/TestSerial_SubView_c03.cpp>
+#include <serial/TestSerial_SubView_c04.cpp>
+#include <serial/TestSerial_SubView_c05.cpp>
+#include <serial/TestSerial_SubView_c06.cpp>
+#include <serial/TestSerial_SubView_c07.cpp>
+#include <serial/TestSerial_SubView_c08.cpp>
+#include <serial/TestSerial_SubView_c09.cpp>
+#include <serial/TestSerial_SubView_c10.cpp>
+#include <serial/TestSerial_SubView_c11.cpp>
+#include <serial/TestSerial_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
index df400b4cb51587b76992c26ff28419b334b5d2d6..f13b2ce1b4bd20e92509fc9dc1801352ff3bb289 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
@@ -40,62 +40,68 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , team_tag )
+TEST_F( serial, team_tag )
 {
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( serial , team_shared_request) {
-  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( serial, team_scratch_request) {
-  TestScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( serial , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( serial, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( serial, shmem_size) {
+TEST_F( serial, shmem_size )
+{
   TestShmemSize< Kokkos::Serial >();
 }
 
-TEST_F( serial, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( serial , team_vector )
+TEST_F( serial, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -107,11 +113,10 @@ TEST_F( serial , team_vector )
 #ifndef SKIP_TEST
 TEST_F( serial, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
index 4c655fe770f26fd8d6b239251c5d6301140faa09..2192159b8439a2b4fdd0fcc38b3be4d382973821 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , impl_view_mapping_a ) {
+TEST_F( serial, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::Serial >();
   test_view_mapping_operator< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
index 4947f2eaaef607b04d680a7c9c64ae6f2d8e6087..8c48ad2ceda81ca46913e3d3206fac96e492950a 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >();
+TEST_F( serial, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Serial >();
 }
 
-TEST_F( serial , impl_view_mapping_b ) {
+TEST_F( serial, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::Serial >();
   TestViewMappingAtomic< Kokkos::Serial >::run();
 }
 
-TEST_F( serial, view_api) {
-  TestViewAPI< double , Kokkos::Serial >();
+TEST_F( serial, view_api )
+{
+  TestViewAPI< double, Kokkos::Serial >();
 }
 
-TEST_F( serial , view_nested_view )
+TEST_F( serial, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Serial >();
 }
 
-
-
-TEST_F( serial , view_remap )
+TEST_F( serial, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Serial > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Serial > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Serial > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Serial > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Serial > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Serial > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( serial , view_aggregate )
+TEST_F( serial, view_aggregate )
 {
   TestViewAggregate< Kokkos::Serial >();
 }
 
-TEST_F( serial , template_meta_functions )
+TEST_F( serial, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Serial >();
+  TestTemplateMetaFunctions< int, Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads.hpp b/lib/kokkos/core/unit_test/threads/TestThreads.hpp
index 4f611cf99c7c0e4f3c4b26f0fada9c7c8469ddbe..0afd6772fefff3e2efd7d490d35f985346163fd6 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads.hpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_THREADS_HPP
 #define KOKKOS_TEST_THREADS_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,13 +56,8 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
@@ -74,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -95,13 +89,13 @@ protected:
     const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
     const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
 
-    unsigned threads_count = 0 ;
+    unsigned threads_count = 0;
 
-    threads_count = std::max( 1u , numa_count )
-                  * std::max( 2u , cores_per_numa * threads_per_core );
+    threads_count = std::max( 1u, numa_count )
+                  * std::max( 2u, cores_per_numa * threads_per_core );
 
     Kokkos::Threads::initialize( threads_count );
-    Kokkos::Threads::print_configuration( std::cout , true /* detailed */ );
+    Kokkos::print_configuration( std::cout, true /* detailed */ );
   }
 
   static void TearDownTestCase()
@@ -110,6 +104,6 @@ protected:
   }
 };
 
+} // namespace Test
 
-}
 #endif
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
index 6e24c4973ed7c37ff559a5ad023a69fabb607b29..d2a5ea5d6352acc79606082fd75c465b0b5b515e 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
@@ -40,165 +40,161 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , atomics )
+TEST_F( threads, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 3 ) ) );
 }
 
-TEST_F( threads , atomic_operations )
+TEST_F( threads, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 4 ) ) );
   }
-
 }
 
-
-TEST_F( threads , atomic_views_integral )
+TEST_F( threads, atomic_views_integral )
 {
   const long length = 1000000;
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 8 ) ) );
-
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 8 ) ) );
   }
 }
 
-TEST_F( threads , atomic_views_nonintegral )
+TEST_F( threads, atomic_views_nonintegral )
 {
   const long length = 1000000;
   {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 4 ) ) );
-
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 4 ) ) );
   }
 }
 
-TEST_F( threads , atomic_view_api )
+TEST_F( threads, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Threads>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
index ac0356eeb4c9c15d5409c0e9d10a772941de57d0..7d268c14547e4680c1ad57d8e66e2b1a4bfaf501 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
@@ -40,65 +40,74 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , init ) {
+TEST_F( threads, init )
+{
   ;
 }
 
-TEST_F( threads , md_range ) {
-  TestMDRange_2D< Kokkos::Threads >::test_for2(100,100);
+TEST_F( threads, mdrange_for ) {
+  TestMDRange_2D< Kokkos::Threads >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Threads >::test_for3( 100, 10, 100 );
+  TestMDRange_4D< Kokkos::Threads >::test_for4( 100, 10, 10, 10 );
+  TestMDRange_5D< Kokkos::Threads >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Threads >::test_for6( 10, 10, 10, 10, 5, 5 );
+}
 
-  TestMDRange_3D< Kokkos::Threads >::test_for3(100,100,100);
+TEST_F( threads, mdrange_reduce ) {
+  TestMDRange_2D< Kokkos::Threads >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::Threads >::test_reduce3( 100, 10, 100 );
 }
 
-TEST_F( threads, policy_construction) {
+TEST_F( threads, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::Threads >();
   TestTeamPolicyConstruction< Kokkos::Threads >();
 }
 
-TEST_F( threads , range_tag )
+TEST_F( threads, range_tag )
 {
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(0);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(3);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( threads , compiler_macros )
+TEST_F( threads, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Threads >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( threads , memory_pool )
+TEST_F( threads, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Threads >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -112,24 +121,24 @@ TEST_F( threads , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 /*
-TEST_F( threads , task_fib )
+TEST_F( threads, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Threads >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Threads >::run( i );
   }
 }
 
-TEST_F( threads , task_depend )
+TEST_F( threads, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Threads >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Threads >::run( i );
   }
 }
 
-TEST_F( threads , task_team )
+TEST_F( threads, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Threads >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Threads >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Threads >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Threads >::run( 1000 ); // Put back after testing.
 }
 */
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -137,53 +146,51 @@ TEST_F( threads , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-TEST_F( threads , cxx11 )
+TEST_F( threads, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Threads >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Threads >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( threads, tile_layout )
 {
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Threads , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Threads, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Threads, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Threads, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Threads, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 9, 11 );
 }
 
-
-TEST_F( threads , dispatch )
+TEST_F( threads, dispatch )
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
index a637d1e3ab654b402e49b7d3aec582e425d2592a..d2b75ca892b5abcf3f405aec37459f53c2a3aafc 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
@@ -40,46 +40,52 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, long_reduce) {
-  TestReduce< long ,   Kokkos::Threads >( 0 );
-  TestReduce< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce )
+{
+  TestReduce< long, Kokkos::Threads >( 0 );
+  TestReduce< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, double_reduce) {
-  TestReduce< double ,   Kokkos::Threads >( 0 );
-  TestReduce< double ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, double_reduce )
+{
+  TestReduce< double, Kokkos::Threads >( 0 );
+  TestReduce< double, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads , reducers )
+TEST_F( threads, reducers )
 {
-  TestReducers<int, Kokkos::Threads>::execute_integer();
-  TestReducers<size_t, Kokkos::Threads>::execute_integer();
-  TestReducers<double, Kokkos::Threads>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Threads>::execute_basic();
+  TestReducers< int, Kokkos::Threads >::execute_integer();
+  TestReducers< size_t, Kokkos::Threads >::execute_integer();
+  TestReducers< double, Kokkos::Threads >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Threads >::execute_basic();
 }
 
-TEST_F( threads, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Threads >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::Threads >( 0 );
+  TestReduceDynamic< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Threads >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::Threads >( 0 );
+  TestReduceDynamic< double, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Threads >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::Threads >( 0 );
+  TestReduceDynamicView< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads , scan )
+TEST_F( threads, scan )
 {
-  TestScan< Kokkos::Threads >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Threads >::test_range( 1, 1000 );
   TestScan< Kokkos::Threads >( 0 );
   TestScan< Kokkos::Threads >( 100000 );
   TestScan< Kokkos::Threads >( 10000000 );
@@ -87,10 +93,11 @@ TEST_F( threads , scan )
 }
 
 #if 0
-TEST_F( threads , scan_small )
+TEST_F( threads, scan_small )
 {
-  typedef TestScan< Kokkos::Threads , Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::Threads, Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -101,38 +108,39 @@ TEST_F( threads , scan_small )
 }
 #endif
 
-TEST_F( threads  , team_scan )
+TEST_F( threads, team_scan )
 {
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( threads , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( threads, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( threads , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( threads, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( threads , reduction_deduction )
+TEST_F( threads, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
index 2df9e19deb0130359d81b8c3cc001bb85ee7cb2f..68a9da6aedef550e94c037df93ff6dc741ff3589 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_assign_strided ) {
+TEST_F( threads, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_0 ) {
+TEST_F( threads, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_1 ) {
+TEST_F( threads, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_2 ) {
+TEST_F( threads, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_3 ) {
+TEST_F( threads, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_0 ) {
+TEST_F( threads, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_1 ) {
+TEST_F( threads, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_3 ) {
+TEST_F( threads, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
index d57dbe97c0d38aaa6a2e48816eb9872a8585afb7..c5cf061e8289d9d8ac5ffea92d38c9cd91349922 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_layoutleft_to_layoutleft) {
+TEST_F( threads, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( threads, view_subview_layoutright_to_layoutright) {
+TEST_F( threads, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
index 67d998c0e86488df0023cc0138ffe022cdc52d94..9018c1f4f799c1f76ee082c57dedc644627c7a75 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign ) {
+TEST_F( threads, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
index e340240c48d6d28c9bc4c79b777a3e1a4a8c4ddc..9483abd9cc3f78430f2234c71708fe0315a949a9 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
index ad27fa0fa6cee9db3eb63c581a175eee0cdd6e4e..e252a26565bf6dad6387b87340c5c93cd2b3415f 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
index 6fca47cc4ce41b56155fac8ce1d4b158d5e99c82..3e211b1a58542b6307a731c3765190e91132d4dd 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d ) {
+TEST_F( threads, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
index c7dfca941582dee3d667f60152854ea30b393548..865d50b1a1b918b99fb36d2a3e5c889a7c93e5a7 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
index 38e8394918614fdb528e9111d7fc1f54c7ff4d83..c5840073b6486226281942bfd0c0ad8e2052ff85 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
index 1f01fe6b5e6104416bb1f2f680cafeab48cac1ad..7b8825ef628dbaa4449f7830abd4e227d842dccc 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left ) {
+TEST_F( threads, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
index e9a1ccbe30edcf7f512a5c20462df83cf52c3ac4..7bc16a5827a602193db55f7ffa044b38babef77d 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
index c8b6c8743dd25a97db5f00e5bc7157c9f040c5d9..57b87b6098bdd818c8e215ffb1d5938043746494 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
index 7cef6fa07be88859c063470857d775964c74f2fa..1875a883d485e1620430cadc59c09554dfc00ac1 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right ) {
+TEST_F( threads, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
index d67bf3157e337fef0af36dbba934f8bc22d74d0c..cf6428b18e333d66f4637fc92a45dc7f51052cc6 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
index e8a2c825cf3a9474d149d81a225cbadb16338cd7..7060fdb273c928d7346686c54d0a374188c47257 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
index 4690be4d3a75d8e5a7b66676ecf6b0482952d116..d802d658309b4ecfbd28a5ec4ce6d17edc4a5f4a 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
@@ -40,67 +40,73 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , team_tag )
+TEST_F( threads, team_tag )
 {
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( threads , team_shared_request) {
-  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( threads, team_scratch_request) {
-  TestScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( threads , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( threads, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( threads, shmem_size) {
+TEST_F( threads, shmem_size )
+{
   TestShmemSize< Kokkos::Threads >();
 }
 
-TEST_F( threads, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( threads , team_vector )
+TEST_F( threads, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -112,11 +118,10 @@ TEST_F( threads , team_vector )
 #ifndef SKIP_TEST
 TEST_F( threads, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
index 46a576b027fb2149302239ba31d6e53bd001e3ce..36eae287936ad9854dd030fc304506c3d3745c03 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , impl_view_mapping_a ) {
+TEST_F( threads, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::Threads >();
   test_view_mapping_operator< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
index b5d6ac843d8177149d53fe1cb52528c6ef760f3d..8c78d094435b3f524668cb1bffa44b5144749063 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::Threads >();
+TEST_F( threads, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Threads >();
 }
 
-TEST_F( threads , impl_view_mapping_b ) {
+TEST_F( threads, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::Threads >();
   TestViewMappingAtomic< Kokkos::Threads >::run();
 }
 
-TEST_F( threads, view_api) {
-  TestViewAPI< double , Kokkos::Threads >();
+TEST_F( threads, view_api )
+{
+  TestViewAPI< double, Kokkos::Threads >();
 }
 
-TEST_F( threads , view_nested_view )
+TEST_F( threads, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Threads >();
 }
 
-
-
-TEST_F( threads , view_remap )
+TEST_F( threads, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Threads > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Threads > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Threads > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Threads > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Threads > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Threads > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throws with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( threads , view_aggregate )
+TEST_F( threads, view_aggregate )
 {
   TestViewAggregate< Kokkos::Threads >();
 }
 
-TEST_F( threads , template_meta_functions )
+TEST_F( threads, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Threads >();
+  TestTemplateMetaFunctions< int, Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/doc/design_notes_space_instances.md b/lib/kokkos/doc/design_notes_space_instances.md
index 487fa25bcb32875ed3ba90821aba006a13cd506e..0124dfbc873285255fa92ff171dc5873056495ab 100644
--- a/lib/kokkos/doc/design_notes_space_instances.md
+++ b/lib/kokkos/doc/design_notes_space_instances.md
@@ -1,35 +1,41 @@
 # Design Notes for Execution and Memory Space Instances
 
+## Objective
 
-## Execution Spaces
+ * Enable Kokkos interoperability with coarse-grain tasking models
+ 
+## Requirements
 
-  *  Work is *dispatched* to an execution space instance
+ * Backwards compatible with the existing Kokkos API
+ * Support existing Host execution spaces (Serial, Threads, OpenMP, maybe Qthreads)
+ * Support DARMA threading model (may require a new Host execution space)
+ * Support Uintah threading model, i.e. independent worker thread pools working off of shared task queues
+ 
+  
+## Execution Space
 
+  * Parallel work is *dispatched* on an execution space instance
+  
+  * Execution space instances are conceptually disjoint/independent from each other
+  
 
-
-## Host Associated Execution Space Instances
-
-Vocabulary and examples assuming C++11 Threads Support Library
+## Host Execution Space Instances
 
   *  A host-side *control* thread dispatches work to an instance
 
-  * `this_thread` is the control thread
-
   * `main` is the initial control thread
 
-  *  An execution space instance is a pool of threads
+  *  A host execution space instance is an organized thread pool
 
-  *  All instances are disjoint thread pools
+  *  All instances are disjoint, i.e. hardware resources are not shared between instances
 
   *  Exactly one control thread is associated with
      an instance and only that control thread may
      dispatch work to to that instance
 
-  *  A control thread may be a member of an instance,
-     if so then it is also the control thread associated
-     with that instance
+  *  The control thread is a member of the instance
 
-  *  The pool of threads associated with an instances is not mutatable
+  *  The pool of threads associated with an instance is not mutable during that instance's existence
 
   *  The pool of threads associated with an instance may be masked
 
@@ -37,130 +43,89 @@ Vocabulary and examples assuming C++11 Threads Support Library
 
     -  Example: only one hyperthread per core of the instance
 
-    -  When a mask is applied to an instance that mask
-       remains until cleared or another mask is applied
-
-    -  Masking is portable by defining it as using a fraction
-       of the available resources (threads)
-
-  *  Instances are shared (referenced counted) objects,
-     just like `Kokkos::View`
-
-```
-struct StdThread {
-  void mask( float fraction );
-  void unmask() { mask( 1.0 ); }
-};
-```
-
-
-
-### Requesting an Execution Space Instance
-
-  *  `Space::request(` *who* `,` *what* `,` *control-opt* `)`
-
-  *  *who* is an identifier for subsquent queries regarding
-    who requested each instance
-
-  *  *what* is the number of threads and how they should be placed
-
-    -  Placement within locality-topology hierarchy; e.g., HWLOC
-
-    -  Compact within a level of hierarchy, or striped across that level;
-       e.g., socket or NUMA region
-
-    -  Granularity of request is core
-
-  *  *control-opt*  optionally specifies whether the instance
-     has a new control thread
-
-    -  *control-opt* includes a control function / closure
-
-    -  The new control thread is a member of the instance
-
-    -  The control function is called by the new control thread
-       and is passed a `const` instance
-
-    -  The instance is **not** returned to the creating control thread
-
-  *  `std::thread` that is not a member of an instance is
-     *hard blocked* on a `std::mutex`
-
-    -  One global mutex or one mutex per thread?
-
-  *  `std::thread` that is a member of an instance is
-     *spinning* waiting for work, or are working
-
-```
-struct StdThread {
-
-  struct Resource ;
-
-  static StdThread request(); // default
+    -  A mask can be applied during the policy creation of a parallel algorithm
+ 
+    -  Masking is portable by defining it as the ceiling of a fraction in [0.0, 1.0]
+       of the available resources
 
-  static StdThread request( const std::string & , const Resource & );
-
-  // If the instance can be reserved then
-  // allocate a copy of ControlClosure and invoke
-  //   ControlClosure::operator()( const StdThread intance ) const
-  template< class ControlClosure >
-  static bool request( const std::string & , const Resource &
-                     , const ControlClosure & );
-};
 ```
-
-### Relinquishing an Execution Space Instance
-
-  *  De-referencing the last reference-counted instance
-     relinquishes the pool of threads
-
-  *  If a control thread was created for the instance then
-     it is relinquished when that control thread returns
-     from the control function
-
-    -  Requires the reference count to be zero, an error if not
-
-  *  No *forced* relinquish
-
-
-
-## CUDA Associated Execution Space Instances
-
-  *  Only a signle CUDA architecture
-
-  *  An instance is a device + stream
-
-  *  A stream is exclusive to an instance
-
-  *  Only a host-side control thread can dispatch work to an instance
-
-  *  Finite number of streams per device
-
-  *  ISSUE:  How to use CUDA `const` memory with multiple streams?
-
-  *  Masking can be mapped to restricting the number of CUDA blocks
-     to the fraction of available resources; e.g., maximum resident blocks
-
-
-### Requesting an Execution Space Instance
-
-  *  `Space::request(` *who* `,` *what* `)`
-
-  *  *who* is an identifier for subsquent queries regarding
-    who requested each instance
-
-  *  *what* is which device, the stream is a requested/relinquished resource
-
+class ExecutionSpace {
+public:
+  using execution_space = ExecutionSpace;
+  using memory_space = ...;
+  using device_type = Kokkos::Device<execution_space, memory_space>;
+  using array_layout = ...;
+  using size_type = ...;
+  using scratch_memory_space = ...;
+  
+  
+  class Instance
+  {
+    int thread_pool_size( int depth = 0 );
+    ...
+  };
+  
+  class InstanceRequest
+  {
+  public:
+    using Control = std::function< void( Instance * )>;
+    
+    InstanceRequest( Control control
+                   , unsigned thread_count
+                   , unsigned use_numa_count = 0
+                   , unsigned use_cores_per_numa = 0
+                   );    
+  
+  };
+  
+  static bool in_parallel();
+  
+  static bool sleep();
+  static bool wake();
+  
+  static void fence();
+  
+  static void print_configuration( std::ostream &, const bool detailed = false );
+  
+  static void initialize( unsigned thread_count = 0
+                        , unsigned use_numa_count = 0
+                        , unsigned use_cores_per_numa = 0
+                        );
+  
+  // Partition the current instance into the requested instances
+  // and run the given functions on the corresponding instances.
+  // This call blocks until all the partitioned instances complete,
+  // at which point the original instance is restored.
+  //
+  // Requires that the space has already been initialized.
+  // Requires that the request can be satisfied by the current instance,
+  //   i.e. the sum of the requested thread counts must be less than
+  //   max_hardware_threads().
+  //
+  // Each control functor will accept a handle to its new default instance.
+  // Each instance must be independent of all other instances,
+  //   i.e. no assumption on scheduling between instances.
+  // The user is responsible for checking the return code for errors.
+  static int run_instances( std::vector< InstanceRequest> const& requests );
+  
+  static void finalize();
+
+  static int is_initialized();
+  
+  static int concurrency();
+  
+  static int thread_pool_size( int depth = 0 );
+  
+  static int thread_pool_rank();
+  
+  static int max_hardware_threads();
+  
+  static int hardware_thread_id();
+                        
+ };
 
 ```
-struct Cuda {
+ 
 
-  struct Resource ;
-
-  static Cuda request();
-
-  static Cuda request( const std::string & , const Resource & );
-};
-```
 
 
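The `run_instances()` declaration above is only a design sketch, so the following is a minimal, self-contained model of its intended contract rather than Kokkos code: the `Instance`, `InstanceRequest`, and `run_instances` names below are stand-ins for the proposed interface, and a plain `std::thread` plays the role of an independent instance. Each control closure receives a handle only to its own instance, and the call blocks until every partitioned instance has completed.

```
#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

struct Instance { unsigned thread_count; };

struct InstanceRequest {
  std::function< void( Instance * ) > control;
  unsigned thread_count;
};

// Launch every control closure on its own instance, block until all of them
// return, then give control back to the caller.
int run_instances( std::vector< InstanceRequest > const & requests )
{
  std::vector< Instance > instances;
  instances.reserve( requests.size() );  // Keep the pointers taken below stable.

  std::vector< std::thread > controls;
  for ( InstanceRequest const & r : requests ) {
    instances.push_back( Instance{ r.thread_count } );
    // Each control closure gets a handle to its own, independent instance.
    controls.emplace_back( r.control, & instances.back() );
  }

  // Block until all partitioned instances complete.
  for ( std::thread & t : controls ) t.join();

  return 0;  // The caller is responsible for checking the return code.
}

int main()
{
  auto report = []( Instance * i ) { std::printf( "instance with %u threads\n", i->thread_count ); };
  return run_instances( { { report, 4 }, { report, 4 } } );
}
```

A real implementation would additionally verify that the summed thread counts fit within max_hardware_threads() and report failure through the return code, as the proposal requires.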
diff --git a/lib/kokkos/example/md_skeleton/types.h b/lib/kokkos/example/md_skeleton/types.h
index 7f92b7cd0f8089d93c1e18e5dff3ad1508316867..c9689188a1c289c67e08dbe07707a51a0f8bff28 100644
--- a/lib/kokkos/example/md_skeleton/types.h
+++ b/lib/kokkos/example/md_skeleton/types.h
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,7 +50,7 @@
 
 typedef Kokkos::DefaultExecutionSpace execution_space ;
 
-#if ! defined( KOKKOS_HAVE_CUDA )
+#if ! defined( KOKKOS_ENABLE_CUDA )
   struct double2 {
     double x, y;
     KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
index 326d064105ecf2da945cf346cbaa9abbe27eab20..249d44ab559682ce2622842048b47af4613ec16f 100644
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -100,7 +100,7 @@ int main (int argc, char* argv[]) {
   // order.  Parallel for loops may execute in any order.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
       // printf works in a CUDA parallel kernel; std::ostream does not.
       printf ("Hello from i = %i\n", i);
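One note on the guard change above: `#if defined(MACRO)` tests only whether the macro is defined, whereas `#if (MACRO)` fails to compile if the macro is defined to an empty value (it expands to `#if ()`) and silently evaluates to 0 if the macro is undefined. A minimal sketch of the difference (`FEATURE_MACRO` is a stand-in name, not a Kokkos macro):

```
#include <cstdio>

#define FEATURE_MACRO  // Defined, but expands to nothing.

// "#if (FEATURE_MACRO)" would expand to "#if ()" here and fail to compile,
// and an *undefined* macro in "#if" would silently evaluate to 0.
#if defined( FEATURE_MACRO )
void hello() { std::printf( "feature branch compiled in\n" ); }
#else
void hello() { std::printf( "fallback branch compiled in\n" ); }
#endif

int main() { hello(); return 0; }
```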
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
index 70eea4324022b8bcfd7e1266f5c47ef08380d8c9..f7f467ad2d1dbd866ad185776cea5d45a9abce3c 100644
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -69,7 +69,7 @@ int main (int argc, char* argv[]) {
   // It also handles any other syntax needed for CUDA.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
       lsum += i*i;
     }, sum);
@@ -85,7 +85,7 @@ int main (int argc, char* argv[]) {
   printf ("Sum of squares of integers from 0 to %i, "
           "computed sequentially, is %i\n", n - 1, seqSum);
   Kokkos::finalize ();
-#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   return (sum == seqSum) ? 0 : -1;
 #else
   return 0;
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
index dd0641be54087a76d45505d0e6777a4ebe1fd9d1..3450ad1bb468095a9d821a1c8e0560b256607166 100644
--- a/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -99,7 +99,7 @@ int main (int argc, char* argv[]) {
   // ask for one.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_for (10, KOKKOS_LAMBDA (const int i) {
     // Acesss the View just like a Fortran array.  The layout depends
     // on the View's memory space, so don't rely on the View's
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
index 216db7f125d16ed7150f2f2049506a723e9dcc79..9ea5e8b70711942cb61ef29f38144b52f81137e0 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -79,7 +79,7 @@ int main (int narg, char* args[]) {
   int sum = 0;
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
       lsum += 1;
       // TeamPolicy<>::member_type provides functions to query the
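For reference, a small self-contained example of the team-level reduction pattern the hunk above belongs to, guarded the same way. This is a sketch assuming a Kokkos build with lambda dispatch enabled; the league size of 12 and the use of `Kokkos::AUTO` for the team size are arbitrary choices, not taken from the tutorial.

```
#include <Kokkos_Core.hpp>
#include <cstdio>

int main( int argc, char * argv[] )
{
  Kokkos::initialize( argc, argv );
  {
    typedef Kokkos::TeamPolicy<>             team_policy;
    typedef team_policy::member_type         team_member;

    int count = 0;

#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
    // 12 teams, team size chosen by Kokkos; every team member contributes 1.
    Kokkos::parallel_reduce( team_policy( 12, Kokkos::AUTO ),
      KOKKOS_LAMBDA( const team_member & thread, int & lsum ) { lsum += 1; },
      count );
#endif

    std::printf( "total team members: %i\n", count );
  }
  Kokkos::finalize();

  return 0;
}
```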
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
index e7bd9da36b4c1eaf60125e6c38f5e3bf7d33bf5d..e671293ff11ad8120766ed014128b25fb39089bc 100755
--- a/lib/kokkos/generate_makefile.bash
+++ b/lib/kokkos/generate_makefile.bash
@@ -5,153 +5,166 @@ MAKE_J_OPTION="32"
 
 while [[ $# > 0 ]]
 do
-key="$1"
+  key="$1"
 
-case $key in
+  case $key in
     --kokkos-path*)
-    KOKKOS_PATH="${key#*=}"
-    ;;
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
     --prefix*)
-    PREFIX="${key#*=}"
-    ;;
+      PREFIX="${key#*=}"
+      ;;
     --with-cuda)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
-    CUDA_PATH_NVCC=`which nvcc`
-    CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH_NVCC=`which nvcc`
+      CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
+      ;;
     # Catch this before '--with-cuda*'
     --with-cuda-options*)
-    KOKKOS_CUDA_OPT="${key#*=}"
-    ;;
+      KOKKOS_CUDA_OPT="${key#*=}"
+      ;;
     --with-cuda*)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
-    CUDA_PATH="${key#*=}"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH="${key#*=}"
+      ;;
     --with-openmp)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
+      ;;
     --with-pthread)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
+      ;;
     --with-serial)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
-    ;;
-    --with-qthread*)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthread"
-    QTHREAD_PATH="${key#*=}"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
+      ;;
+    --with-qthreads*)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthreads"
+      if [ -z "$QTHREADS_PATH" ]; then
+        QTHREADS_PATH="${key#*=}"
+      fi
+      ;;
     --with-devices*)
-    DEVICES="${key#*=}"
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
-    ;;
+      DEVICES="${key#*=}"
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
+      ;;
     --with-gtest*)
-    GTEST_PATH="${key#*=}"
-    ;;
+      GTEST_PATH="${key#*=}"
+      ;;
     --with-hwloc*)
-    HWLOC_PATH="${key#*=}"
-    ;;
+      HWLOC_PATH="${key#*=}"
+      ;;
     --arch*)
-    KOKKOS_ARCH="${key#*=}"
-    ;;
+      KOKKOS_ARCH="${key#*=}"
+      ;;
     --cxxflags*)
-    CXXFLAGS="${key#*=}"
-    ;;
+      CXXFLAGS="${key#*=}"
+      ;;
     --ldflags*)
-    LDFLAGS="${key#*=}"
-    ;;
+      LDFLAGS="${key#*=}"
+      ;;
     --debug|-dbg)
-    KOKKOS_DEBUG=yes
-    ;;
+      KOKKOS_DEBUG=yes
+      ;;
     --make-j*)
-    MAKE_J_OPTION="${key#*=}"
-    ;;
+      MAKE_J_OPTION="${key#*=}"
+      ;;
     --compiler*)
-    COMPILER="${key#*=}"
-    CNUM=`which ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l`
-    if [ ${CNUM} -gt 0 ]; then
-      echo "Invalid compiler by --compiler command: '${COMPILER}'"
-      exit
-    fi
-    if [[ ! -n  ${COMPILER} ]]; then
-      echo "Empty compiler specified by --compiler command."
-      exit
-    fi
-    CNUM=`which ${COMPILER} | grep ${COMPILER} | wc -l`
-    if [ ${CNUM} -eq 0 ]; then
-      echo "Invalid compiler by --compiler command: '${COMPILER}'"
-      exit
-    fi 
-    ;;
-    --with-options*)
-    KOKKOS_OPT="${key#*=}"
-    ;;
+      COMPILER="${key#*=}"
+      CNUM=`which ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l`
+      if [ ${CNUM} -gt 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit
+      fi
+      if [[ ! -n  ${COMPILER} ]]; then
+        echo "Empty compiler specified by --compiler command."
+        exit
+      fi
+      CNUM=`which ${COMPILER} | grep ${COMPILER} | wc -l`
+      if [ ${CNUM} -eq 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit
+      fi 
+      ;;
+    --with-options*)
+      KOKKOS_OPT="${key#*=}"
+      ;;
     --help)
-    echo "Kokkos configure options:"
-    echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
-    echo "--prefix=/Install/Path:        Path to where the Kokkos library should be installed"
-    echo ""
-    echo "--with-cuda[=/Path/To/Cuda]:      enable Cuda and set path to Cuda Toolkit"
-    echo "--with-openmp:                    enable OpenMP backend"
-    echo "--with-pthread:                   enable Pthreads backend"
-    echo "--with-serial:                    enable Serial backend"
-    echo "--with-qthread=/Path/To/Qthread:  enable Qthread backend"
-    echo "--with-devices:                   explicitly add a set of backends"
-    echo ""
-    echo "--arch=[OPTIONS]:            set target architectures. Options are:"
-    echo "                               ARMv80         = ARMv8.0 Compatible CPU"
-    echo "                               ARMv81         = ARMv8.1 Compatible CPU"
-    echo "                               ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU"
-    echo "                               SNB            = Intel Sandy/Ivy Bridge CPUs"
-    echo "                               HSW            = Intel Haswell CPUs"
-    echo "                               BDW            = Intel Broadwell Xeon E-class CPUs"
-    echo "                               SKX            = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
-    echo "                               KNC            = Intel Knights Corner Xeon Phi"
-    echo "                               KNL            = Intel Knights Landing Xeon Phi"
-    echo "                               Kepler30       = NVIDIA Kepler generation CC 3.0"
-    echo "                               Kepler35       = NVIDIA Kepler generation CC 3.5"
-    echo "                               Kepler37       = NVIDIA Kepler generation CC 3.7"
-    echo "                               Pascal60       = NVIDIA Pascal generation CC 6.0"
-    echo "                               Pascal61       = NVIDIA Pascal generation CC 6.1"
-    echo "                               Maxwell50      = NVIDIA Maxwell generation CC 5.0"
-    echo "                               Power8         = IBM POWER8 CPUs"
-    echo "                               Power9         = IBM POWER9 CPUs"
-    echo ""
-    echo "--compiler=/Path/To/Compiler set the compiler"
-    echo "--debug,-dbg:                enable Debugging"
-    echo "--cxxflags=[FLAGS]           overwrite CXXFLAGS for library build and test build"
-    echo "                               This will still set certain required flags via"
-    echo "                               KOKKOS_CXXFLAGS (such as -fopenmp, --std=c++11, etc.)"
-    echo "--ldflags=[FLAGS]            overwrite LDFLAGS for library build and test build"
-    echo "                               This will still set certain required flags via"
-    echo "                               KOKKOS_LDFLAGS (such as -fopenmp, -lpthread, etc.)"
-    echo "--with-gtest=/Path/To/Gtest: set path to gtest (used in unit and performance tests"
-    echo "--with-hwloc=/Path/To/Hwloc: set path to hwloc"
-    echo "--with-options=[OPTIONS]:    additional options to Kokkos:"
-    echo "                               aggressive_vectorization = add ivdep on loops"
-    echo "--with-cuda-options=[OPT]:   additional options to CUDA:"
-    echo "                               force_uvm, use_ldg, enable_lambda, rdc"
-    echo "--make-j=[NUM]:              set -j flag used during build."
-    exit 0
-    ;;
+      echo "Kokkos configure options:"
+      echo "--kokkos-path=/Path/To/Kokkos:        Path to the Kokkos root directory."
+      echo "--qthreads-path=/Path/To/Qthreads:    Path to Qthreads install directory."
+      echo "                                        Overrides path given by --with-qthreads."
+      echo "--prefix=/Install/Path:               Path to install the Kokkos library."
+      echo ""
+      echo "--with-cuda[=/Path/To/Cuda]:          Enable Cuda and set path to Cuda Toolkit."
+      echo "--with-openmp:                        Enable OpenMP backend."
+      echo "--with-pthread:                       Enable Pthreads backend."
+      echo "--with-serial:                        Enable Serial backend."
+      echo "--with-qthreads[=/Path/To/Qthreads]:  Enable Qthreads backend."
+      echo "--with-devices:                       Explicitly add a set of backends."
+      echo ""
+      echo "--arch=[OPT]:  Set target architectures. Options are:"
+      echo "                 ARMv80         = ARMv8.0 Compatible CPU"
+      echo "                 ARMv81         = ARMv8.1 Compatible CPU"
+      echo "                 ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU"
+      echo "                 SNB            = Intel Sandy/Ivy Bridge CPUs"
+      echo "                 HSW            = Intel Haswell CPUs"
+      echo "                 BDW            = Intel Broadwell Xeon E-class CPUs"
+      echo "                 SKX            = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
+      echo "                 KNC            = Intel Knights Corner Xeon Phi"
+      echo "                 KNL            = Intel Knights Landing Xeon Phi"
+      echo "                 Kepler30       = NVIDIA Kepler generation CC 3.0"
+      echo "                 Kepler35       = NVIDIA Kepler generation CC 3.5"
+      echo "                 Kepler37       = NVIDIA Kepler generation CC 3.7"
+      echo "                 Pascal60       = NVIDIA Pascal generation CC 6.0"
+      echo "                 Pascal61       = NVIDIA Pascal generation CC 6.1"
+      echo "                 Maxwell50      = NVIDIA Maxwell generation CC 5.0"
+      echo "                 Power8         = IBM POWER8 CPUs"
+      echo "                 Power9         = IBM POWER9 CPUs"
+      echo ""
+      echo "--compiler=/Path/To/Compiler  Set the compiler."
+      echo "--debug,-dbg:                 Enable Debugging."
+      echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
+      echo "                                build.  This will still set certain required"
+      echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
+      echo "                                --std=c++11, etc.)."
+      echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
+      echo "                                build. This will still set certain required"
+      echo "                                flags via KOKKOS_LDFLAGS (such as -fopenmp,"
+      echo "                                -lpthread, etc.)."
+      echo "--with-gtest=/Path/To/Gtest:  Set path to gtest.  (Used in unit and performance"
+      echo "                                tests.)"
+      echo "--with-hwloc=/Path/To/Hwloc:  Set path to hwloc."
+      echo "--with-options=[OPT]:         Additional options to Kokkos:"
+      echo "                                aggressive_vectorization = add ivdep on loops"
+      echo "--with-cuda-options=[OPT]:    Additional options to CUDA:"
+      echo "                                force_uvm, use_ldg, enable_lambda, rdc"
+      echo "--make-j=[NUM]:               Set -j flag used during build."
+      exit 0
+      ;;
     *)
-    echo "warning: ignoring unknown option $key"
-    ;;
-esac
-shift
+      echo "warning: ignoring unknown option $key"
+      ;;
+  esac
+
+  shift
 done
 
-# If KOKKOS_PATH undefined, assume parent dir of this
-# script is the KOKKOS_PATH
+# Remove leading ',' from KOKKOS_DEVICES.
+KOKKOS_DEVICES=$(echo $KOKKOS_DEVICES | sed 's/^,//')
+
+# If KOKKOS_PATH undefined, assume parent dir of this script is the KOKKOS_PATH.
 if [ -z "$KOKKOS_PATH" ]; then
-    KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+  KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
 else
-    # Ensure KOKKOS_PATH is abs path
-    KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+  # Ensure KOKKOS_PATH is an absolute path.
+  KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
 fi
 
 if [ "${KOKKOS_PATH}"  = "${PWD}" ] || [ "${KOKKOS_PATH}"  = "${PWD}/" ]; then
-echo "Running generate_makefile.sh in the Kokkos root directory is not allowed"
-exit 
+  echo "Running generate_makefile.sh in the Kokkos root directory is not allowed"
+  exit 
 fi
 
 KOKKOS_SRC_PATH=${KOKKOS_PATH}
@@ -160,52 +173,63 @@ KOKKOS_SETTINGS="KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH}"
 #KOKKOS_SETTINGS="KOKKOS_PATH=${KOKKOS_PATH}"
 
 if [ ${#COMPILER} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}"
 fi
+
 if [ ${#KOKKOS_DEVICES} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
 fi
+
 if [ ${#KOKKOS_ARCH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}"
 fi
+
 if [ ${#KOKKOS_DEBUG} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
 fi
+
 if [ ${#CUDA_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}"
 fi
+
 if [ ${#CXXFLAGS} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
 fi
+
 if [ ${#LDFLAGS} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
 fi
+
 if [ ${#GTEST_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
 else
-GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+  GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
 fi
+
 if [ ${#HWLOC_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
 fi
-if [ ${#QTHREAD_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} QTHREAD_PATH=${QTHREAD_PATH}"
+
+if [ ${#QTHREADS_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} QTHREADS_PATH=${QTHREADS_PATH}"
 fi
+
 if [ ${#KOKKOS_OPT} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}"
 fi
+
 if [ ${#KOKKOS_CUDA_OPT} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
 fi
 
 KOKKOS_SETTINGS_NO_KOKKOS_PATH="${KOKKOS_SETTINGS}"
 
 KOKKOS_TEST_INSTALL_PATH="${PWD}/install"
 if [ ${#PREFIX} -gt 0 ]; then
-KOKKOS_INSTALL_PATH="${PREFIX}"
+  KOKKOS_INSTALL_PATH="${PREFIX}"
 else
-KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH}
+  KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH}
 fi
 
 
@@ -229,7 +253,7 @@ mkdir example/fenl
 mkdir example/tutorial
 
 if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
-mkdir example/ichol
+  mkdir example/ichol
 fi
 
 KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}"