From 236ebf7fabbb3dc07b231b9a74c378c40540b1b4 Mon Sep 17 00:00:00 2001
From: Steve Plimpton <sjplimp@sandia.gov>
Date: Thu, 8 Sep 2016 13:56:18 -0600
Subject: [PATCH] Kokkos lib update

---
 lib/kokkos/CMakeLists.txt                     |   54 +-
 lib/kokkos/Makefile.kokkos                    |   23 +-
 lib/kokkos/Makefile.targets                   |   14 +-
 lib/kokkos/README                             |   31 +-
 lib/kokkos/algorithms/src/Kokkos_Random.hpp   |  349 +-
 .../algorithms/unit_tests/TestRandom.hpp      |   35 +-
 lib/kokkos/cmake/deps/CUDA.cmake              |   79 +
 lib/kokkos/cmake/deps/CUSPARSE.cmake          |   64 +
 lib/kokkos/cmake/deps/HWLOC.cmake             |   70 +
 lib/kokkos/cmake/deps/Pthread.cmake           |   83 +
 lib/kokkos/cmake/deps/QTHREAD.cmake           |   70 +
 lib/kokkos/cmake/tribits.cmake                |  485 +++
 .../kokkos-trilinos-integration-procedure.txt |  153 +
 lib/kokkos/config/master_history.txt          |    3 +
 lib/kokkos/config/nvcc_wrapper                |   46 +-
 lib/kokkos/config/test_all_sandia             |  201 +-
 .../performance_tests/CMakeLists.txt          |   17 +-
 .../containers/performance_tests/TestCuda.cpp |    9 +
 .../performance_tests/TestDynRankView.hpp     |  265 ++
 .../performance_tests/TestGlobal2LocalIds.hpp |    2 +-
 .../performance_tests/TestOpenMP.cpp          |    9 +
 .../performance_tests/TestThreads.cpp         |    9 +
 .../TestUnorderedMapPerformance.hpp           |    4 +-
 .../containers/src/Kokkos_DynRankView.hpp     | 1367 +++++--
 .../containers/src/Kokkos_DynamicView.hpp     |    9 +-
 .../src/impl/Kokkos_Bitset_impl.hpp           |  110 +-
 .../containers/unit_tests/TestDynViewAPI.hpp  |  374 +-
 .../containers/unit_tests/TestDynamicView.hpp |    7 +-
 lib/kokkos/core/cmake/KokkosCore_config.h.in  |    1 +
 lib/kokkos/core/perf_test/CMakeLists.txt      |   15 +-
 lib/kokkos/core/perf_test/PerfTestCuda.cpp    |    2 +-
 .../core/perf_test/PerfTestGramSchmidt.hpp    |    2 +-
 lib/kokkos/core/perf_test/PerfTestHexGrad.hpp |    2 +-
 lib/kokkos/core/perf_test/test_atomic.cpp     |   19 +-
 lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp  |   38 +-
 lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp |  163 +-
 .../core/src/Cuda/Kokkos_Cuda_Alloc.hpp       |    1 -
 .../src/Cuda/Kokkos_Cuda_BasicAllocators.cpp  |  198 -
 .../src/Cuda/Kokkos_Cuda_BasicAllocators.hpp  |  190 -
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp |  111 +-
 .../core/src/Cuda/Kokkos_Cuda_Internal.hpp    |   40 +-
 .../core/src/Cuda/Kokkos_Cuda_Parallel.hpp    |  774 ++--
 .../core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp  |   18 +-
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp |  179 +
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp |  519 +++
 .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp  |   12 +-
 .../core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp  |   24 +-
 lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp |  339 --
 .../core/src/KokkosExp_MDRangePolicy.hpp      |  611 +++
 lib/kokkos/core/src/KokkosExp_View.hpp        | 2306 ------------
 lib/kokkos/core/src/Kokkos_Complex.hpp        |   15 +-
 ...asicAllocators.hpp => Kokkos_Concepts.hpp} |   99 +-
 lib/kokkos/core/src/Kokkos_Core.hpp           |   72 -
 lib/kokkos/core/src/Kokkos_Core_fwd.hpp       |    7 +-
 lib/kokkos/core/src/Kokkos_Cuda.hpp           |   15 +-
 lib/kokkos/core/src/Kokkos_CudaSpace.hpp      |   93 +-
 lib/kokkos/core/src/Kokkos_ExecPolicy.hpp     |  231 +-
 lib/kokkos/core/src/Kokkos_HBWSpace.hpp       |   15 -
 lib/kokkos/core/src/Kokkos_HostSpace.hpp      |   22 -
 lib/kokkos/core/src/Kokkos_Macros.hpp         |   31 +-
 lib/kokkos/core/src/Kokkos_MemoryPool.hpp     | 1701 +++++++--
 lib/kokkos/core/src/Kokkos_OpenMP.hpp         |   11 +-
 lib/kokkos/core/src/Kokkos_Pair.hpp           |   25 +-
 lib/kokkos/core/src/Kokkos_Parallel.hpp       |  431 +--
 .../core/src/Kokkos_Parallel_Reduce.hpp       | 1240 ++++++
 lib/kokkos/core/src/Kokkos_ScratchSpace.hpp   |   68 +-
 lib/kokkos/core/src/Kokkos_Serial.hpp         |  185 +-
 lib/kokkos/core/src/Kokkos_TaskPolicy.hpp     |  652 +++-
 lib/kokkos/core/src/Kokkos_Threads.hpp        |   10 +-
 lib/kokkos/core/src/Kokkos_View.hpp           | 3322 +++++++++--------
 .../src/OpenMP/Kokkos_OpenMP_Parallel.hpp     |  120 +-
 .../core/src/OpenMP/Kokkos_OpenMP_Task.cpp    |  329 ++
 .../core/src/OpenMP/Kokkos_OpenMP_Task.hpp    |  356 ++
 .../core/src/OpenMP/Kokkos_OpenMPexec.cpp     |   32 +-
 .../core/src/OpenMP/Kokkos_OpenMPexec.hpp     |  138 +-
 .../core/src/Qthread/Kokkos_QthreadExec.cpp   |    2 +-
 .../core/src/Qthread/Kokkos_QthreadExec.hpp   |   32 +-
 .../src/Qthread/Kokkos_Qthread_Parallel.hpp   |  110 +-
 .../src/Qthread/Kokkos_Qthread_TaskPolicy.cpp |   37 +-
 .../src/Qthread/Kokkos_Qthread_TaskPolicy.hpp |  107 +-
 lib/kokkos/core/src/Qthread/README            |   21 +-
 .../core/src/Threads/Kokkos_ThreadsExec.cpp   |   43 +-
 .../core/src/Threads/Kokkos_ThreadsExec.hpp   |   14 -
 .../core/src/Threads/Kokkos_ThreadsTeam.hpp   |   58 +-
 .../src/Threads/Kokkos_Threads_Parallel.hpp   |  100 +-
 .../src/Threads/Kokkos_Threads_TaskPolicy.cpp |   14 +-
 .../src/Threads/Kokkos_Threads_TaskPolicy.hpp |    5 +-
 .../core/src/impl/KokkosExp_SharedAlloc.hpp   |    2 +-
 .../core/src/impl/KokkosExp_ViewCtor.hpp      |    4 -
 .../core/src/impl/KokkosExp_ViewMapping.hpp   |   24 +-
 .../src/impl/Kokkos_AllocationTracker.cpp     |  848 -----
 .../src/impl/Kokkos_AllocationTracker.hpp     |  574 ---
 .../core/src/impl/Kokkos_AnalyzePolicy.hpp    |  197 +
 .../Kokkos_Atomic_Compare_Exchange_Strong.hpp |   22 +-
 .../core/src/impl/Kokkos_Atomic_Exchange.hpp  |   24 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_Add.hpp |   41 +-
 .../core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp |    2 +-
 .../core/src/impl/Kokkos_Atomic_Generic.hpp   |   40 +
 .../core/src/impl/Kokkos_Atomic_View.hpp      |   36 -
 .../core/src/impl/Kokkos_BasicAllocators.cpp  |  287 --
 lib/kokkos/core/src/impl/Kokkos_BitOps.hpp    |  122 +
 lib/kokkos/core/src/impl/Kokkos_Core.cpp      |    4 +-
 .../core/src/impl/Kokkos_FunctorAdapter.hpp   |  120 +-
 lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp  |   18 -
 lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp |   16 -
 .../src/impl/Kokkos_MemoryPool_Inline.hpp     |  446 ---
 .../core/src/impl/Kokkos_PhysicalLayout.hpp   |   13 +-
 .../src/impl/Kokkos_Profiling_DeviceInfo.hpp  |    2 +-
 .../src/impl/Kokkos_Profiling_Interface.cpp   |   13 +-
 .../src/impl/Kokkos_Profiling_Interface.hpp   |    2 +-
 .../core/src/impl/Kokkos_Serial_Task.cpp      |  147 +
 .../core/src/impl/Kokkos_Serial_Task.hpp      |  271 ++
 .../src/impl/Kokkos_Serial_TaskPolicy.cpp     |   18 +-
 .../src/impl/Kokkos_Serial_TaskPolicy.hpp     |   10 +-
 lib/kokkos/core/src/impl/Kokkos_Tags.hpp      |  137 +-
 lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp |  499 +++
 .../core/src/impl/Kokkos_TaskQueue_impl.hpp   |  569 +++
 lib/kokkos/core/src/impl/Kokkos_Timer.hpp     |    3 +
 lib/kokkos/core/src/impl/Kokkos_Traits.hpp    |   25 +-
 .../core/src/impl/Kokkos_ViewSupport.hpp      |  129 -
 .../core/src/impl/Kokkos_ViewTileLeft.hpp     |  153 -
 lib/kokkos/core/unit_test/CMakeLists.txt      |   19 +-
 lib/kokkos/core/unit_test/Makefile            |   23 +-
 lib/kokkos/core/unit_test/TestAggregate.hpp   |  661 ----
 .../core/unit_test/TestAggregateReduction.hpp |   14 +-
 .../core/unit_test/TestAllocationTracker.cpp  |  159 -
 lib/kokkos/core/unit_test/TestAtomic.hpp      |   11 +-
 .../core/unit_test/TestAtomicOperations.hpp   |  841 +++++
 lib/kokkos/core/unit_test/TestCuda.cpp        |  361 +-
 lib/kokkos/core/unit_test/TestCuda_a.cpp      |  443 +--
 lib/kokkos/core/unit_test/TestCuda_b.cpp      |  486 +--
 lib/kokkos/core/unit_test/TestCuda_c.cpp      |  480 +--
 ...Init.cpp => TestDefaultDeviceTypeInit.hpp} |   35 +-
 .../unit_test/TestDefaultDeviceTypeInit_1.cpp |    2 +
 .../TestDefaultDeviceTypeInit_10.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_11.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_12.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_13.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_14.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_15.cpp          |    2 +
 .../TestDefaultDeviceTypeInit_16.cpp          |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_2.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_3.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_4.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_5.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_6.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_7.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_8.cpp |    2 +
 .../unit_test/TestDefaultDeviceTypeInit_9.cpp |    2 +
 .../TestDefaultDeviceType_a.cpp}              |   42 +-
 lib/kokkos/core/unit_test/TestMDRange.hpp     |  555 +++
 lib/kokkos/core/unit_test/TestMemoryPool.hpp  |  558 ++-
 lib/kokkos/core/unit_test/TestOpenMP.cpp      |   78 +
 lib/kokkos/core/unit_test/TestOpenMP_a.cpp    |   23 +-
 lib/kokkos/core/unit_test/TestOpenMP_b.cpp    |   54 +-
 lib/kokkos/core/unit_test/TestOpenMP_c.cpp    |   62 +-
 .../core/unit_test/TestPolicyConstruction.hpp |   26 +-
 lib/kokkos/core/unit_test/TestQthread.cpp     |   12 +-
 lib/kokkos/core/unit_test/TestRange.hpp       |    5 +-
 lib/kokkos/core/unit_test/TestReduce.hpp      | 1408 +++++++
 lib/kokkos/core/unit_test/TestSerial.cpp      |  131 +-
 lib/kokkos/core/unit_test/TestTaskPolicy.hpp  |  516 ++-
 lib/kokkos/core/unit_test/TestTeam.hpp        |  291 +-
 lib/kokkos/core/unit_test/TestThreads.cpp     |  120 +-
 lib/kokkos/core/unit_test/TestViewAPI.hpp     |    4 +-
 lib/kokkos/example/fenl/CGSolve.hpp           |    4 +-
 lib/kokkos/example/fenl/fenl_functors.hpp     |    2 +-
 lib/kokkos/example/fenl/fenl_impl.hpp         |    2 +-
 lib/kokkos/example/global_2_local_ids/G2L.hpp |    2 +-
 .../example_chol_performance_device.hpp       |    2 +-
 lib/kokkos/example/md_skeleton/main.cpp       |    4 +-
 lib/kokkos/example/multi_fem/Explicit.hpp     |    2 +-
 lib/kokkos/example/multi_fem/Implicit.hpp     |    2 +-
 lib/kokkos/example/multi_fem/Nonlinear.hpp    |    2 +-
 .../example/multi_fem/SparseLinearSystem.hpp  |    2 +-
 lib/kokkos/example/sort_array/CMakeLists.txt  |    1 -
 lib/kokkos/example/sort_array/sort_array.hpp  |    2 +-
 .../example/tutorial/01_hello_world/Makefile  |    2 +-
 .../tutorial/01_hello_world_lambda/Makefile   |    2 +-
 .../tutorial/02_simple_reduce/Makefile        |    2 +-
 .../tutorial/02_simple_reduce_lambda/Makefile |    2 +-
 .../example/tutorial/03_simple_view/Makefile  |    2 +-
 .../tutorial/03_simple_view_lambda/Makefile   |    2 +-
 .../tutorial/04_simple_memoryspaces/Makefile  |    2 +-
 .../tutorial/05_simple_atomics/Makefile       |    2 +-
 .../Advanced_Views/01_data_layouts/Makefile   |    2 +-
 .../01_data_layouts/data_layouts.cpp          |    4 +-
 .../Advanced_Views/02_memory_traits/Makefile  |    2 +-
 .../02_memory_traits/memory_traits.cpp        |    4 +-
 .../Advanced_Views/03_subviews/Makefile       |    2 +-
 .../Advanced_Views/04_dualviews/Makefile      |    2 +-
 .../Advanced_Views/04_dualviews/dual_view.cpp |   10 +-
 .../Advanced_Views/05_NVIDIA_UVM/Makefile     |    2 +-
 .../05_NVIDIA_UVM/uvm_example.cpp             |    2 +-
 .../Advanced_Views/06_AtomicViews/Makefile    |    2 +-
 .../07_Overlapping_DeepCopy/Makefile          |    2 +-
 .../overlapping_deepcopy.cpp                  |    2 +-
 .../Algorithms/01_random_numbers/Makefile     |    2 +-
 .../01_random_numbers/random_numbers.cpp      |    2 +-
 .../01_thread_teams/Makefile                  |    2 +-
 .../01_thread_teams_lambda/Makefile           |    3 +-
 .../thread_teams_lambda.cpp                   |    3 +-
 .../02_nested_parallel_for/Makefile           |    2 +-
 .../03_vectorization/Makefile                 |    2 +-
 .../03_vectorization/vectorization.cpp        |    6 +-
 .../04_team_scan/Makefile                     |    2 +-
 .../04_team_scan/team_scan.cpp                |    2 +-
 lib/kokkos/generate_makefile.bash             |    2 +-
 src/MANYBODY/pair_vashishta.cpp               |    2 +-
 src/MANYBODY/pair_vashishta.h                 |   14 +-
 src/fix_nve_sphere.cpp                        |   31 +-
 src/group.cpp                                 |  113 +-
 212 files changed, 18620 insertions(+), 13184 deletions(-)
 create mode 100644 lib/kokkos/cmake/deps/CUDA.cmake
 create mode 100644 lib/kokkos/cmake/deps/CUSPARSE.cmake
 create mode 100644 lib/kokkos/cmake/deps/HWLOC.cmake
 create mode 100644 lib/kokkos/cmake/deps/Pthread.cmake
 create mode 100644 lib/kokkos/cmake/deps/QTHREAD.cmake
 create mode 100644 lib/kokkos/cmake/tribits.cmake
 create mode 100644 lib/kokkos/config/kokkos-trilinos-integration-procedure.txt
 create mode 100644 lib/kokkos/config/master_history.txt
 create mode 100644 lib/kokkos/containers/performance_tests/TestDynRankView.hpp
 delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
 delete mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp
 create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
 create mode 100644 lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
 create mode 100644 lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
 delete mode 100644 lib/kokkos/core/src/KokkosExp_View.hpp
 rename lib/kokkos/core/src/{impl/Kokkos_BasicAllocators.hpp => Kokkos_Concepts.hpp} (56%)
 create mode 100644 lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
 create mode 100644 lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 create mode 100644 lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
 delete mode 100644 lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp
 delete mode 100644 lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
 delete mode 100644 lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
 delete mode 100644 lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
 create mode 100644 lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
 delete mode 100644 lib/kokkos/core/unit_test/TestAllocationTracker.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestAtomicOperations.hpp
 rename lib/kokkos/core/unit_test/{TestDefaultDeviceTypeInit.cpp => TestDefaultDeviceTypeInit.hpp} (93%)
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp
 create mode 100644 lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp
 rename lib/kokkos/core/{src/impl/Kokkos_MemoryPool.cpp => unit_test/TestDefaultDeviceType_a.cpp} (77%)
 create mode 100644 lib/kokkos/core/unit_test/TestMDRange.hpp

diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
index f45fc8d9fc..1219352f73 100644
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@@ -1,4 +1,15 @@
 
+IF(COMMAND TRIBITS_PACKAGE_DECL)
+  SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
+ELSE()
+  SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
+ENDIF()
+
+IF(NOT KOKKOS_HAS_TRILINOS)
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
+  INCLUDE(cmake/tribits.cmake)
+ENDIF()
+
 #
 # A) Forward delcare the package so that certain options are also defined for
 # subpackages
@@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
 # subpackages as well.
 #
 
-TRIBITS_ADD_DEBUG_OPTION()
+
+
+# mfh 01 Aug 2016: See Issue #61:
+#
+# https://github.com/kokkos/kokkos/issues/61
+#
+# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
+# HAVE_KOKKOS_DEBUG.  We define KOKKOS_HAVE_DEBUG here instead,
+# for compatibility with Kokkos' Makefile build system.
+
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  ${PACKAGE_NAME}_ENABLE_DEBUG
+  ${PACKAGE_NAME_UC}_HAVE_DEBUG
+  "Enable run-time debug checks.  These checks may be expensive, so they are disabled by default in a release build."
+  ${${PROJECT_NAME}_ENABLE_DEBUG}
+)
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_SIERRA_BUILD
@@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   "${TPL_ENABLE_MPI}"
   )
 
+# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
+#
+# CMake is case sensitive.  The Kokkos_ENABLE_Debug_Bounds_Check
+# option (defined below) is annoyingly not all caps, but we need to
+# keep it that way for backwards compatibility.  If users forget and
+# try using an all-caps variable, then make it count by using the
+# all-caps version as the default value of the original, not-all-caps
+# option.  Otherwise, the default value of this option comes from
+# Kokkos_ENABLE_DEBUG (see Issue #367).
+
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
+IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+  IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
+  ELSE()
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+  ENDIF()
+ELSE()
+  SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+ENDIF()
+ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
+
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_Debug_Bounds_Check
   KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-  "Enable bounds checking support in Kokkos."
-  OFF
+  "Enable Kokkos::View run-time bounds checking."
+  "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
   )
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index c01ceaf64d..c9b6cc464d 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
 #Options: OpenMP,Serial,Pthreads,Cuda
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
-#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
+#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
 KOKKOS_ARCH ?= ""
 #Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 
 #NVIDIA based
@@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
 
 #Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
-KOKKOS_INTERNAL_USE_ARCH_AVX2      := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX2      := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 
 # Decide what ISA level we are able to support
-KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
+KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ISA_KNC        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
 
@@ -304,8 +308,8 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += -mcpu=power8
-	KOKKOS_LDFLAGS  += -mcpu=power8
+	KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+	KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
 
 			else
 				# Assume that this is a really a GNU compiler
-				KOKKOS_CXXFLAGS += -march=core-avx2
-				KOKKOS_LDFLAGS  += -march=core-avx2
+				KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+				KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
 			endif
 		endif
 	endif
@@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
     tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
 	KOKKOS_CXXFLAGS += -arch=sm_53
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+        KOKKOS_CXXFLAGS += -arch=sm_61
+endif
 endif
  
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
index 876ae033b7..86929ea0fe 100644
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@@ -1,9 +1,5 @@
 Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
-Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
 Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
 Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
@@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
 Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 endif
@@ -61,6 +61,8 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 endif
 
 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
diff --git a/lib/kokkos/README b/lib/kokkos/README
index 25b3778d95..b094578af6 100644
--- a/lib/kokkos/README
+++ b/lib/kokkos/README
@@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
 ====Requirements============================================================
 ============================================================================
 
-Primary tested compilers are:
+Primary tested compilers on X86 are:
   GCC 4.7.2
   GCC 4.8.4
   GCC 4.9.2
@@ -48,26 +48,43 @@ Primary tested compilers are:
   Clang 3.5.2
   Clang 3.6.1
 
+Primary tested compilers on Power 8 are:
+  IBM XL 13.1.3 (OpenMP,Serial)
+  GCC 4.9.2 (OpenMP,Serial)
+  GCC 5.3.0 (OpenMP,Serial)
+
 Secondary tested compilers are:
   CUDA 6.5 (with gcc 4.7.2)
   CUDA 7.0 (with gcc 4.7.2)
   CUDA 7.5 (with gcc 4.8.4)
 
 Other compilers working:
-  PGI 15.4
-  IBM XL 13.1.2
-  Cygwin 2.1.0 64bit with gcc 4.9.3
+  X86:
+   Intel 17.0.042 (the FENL example causes internal compiler error)
+   PGI 15.4
+   Cygwin 2.1.0 64bit with gcc 4.9.3
+  KNL:
+   Intel 16.2.181 (the FENL example causes internal compiler error)
+   Intel 17.0.042 (the FENL example causes internal compiler error)
+
+Known non-working combinations:
+  Power8:
+   GCC 6.1.0
+   Pthreads backend
+
 
 Primary tested compiler are passing in release mode
-with warnings as errors. We are using the following set
-of flags:
+with warnings as errors. They also are tested with a comprehensive set of 
+backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
+We are using the following set of flags:
 GCC:   -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
        -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
 Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 
 Secondary compilers are passing without -Werror.
-Other compilers are tested occasionally.
+Other compilers are tested occasionally, in particular when pushing from develop to 
+master branch, without -Werror and only for a select set of backends.
 
 ============================================================================
 ====Getting started=========================================================
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 192b1d64f8..d7c06dc14b 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -771,6 +771,7 @@ namespace Kokkos {
     friend class Random_XorShift1024_Pool<DeviceType>;
   public:
 
+    typedef Random_XorShift1024_Pool<DeviceType> pool_type;
     typedef DeviceType device_type;
 
     enum {MAX_URAND = 0xffffffffU};
@@ -779,10 +780,10 @@ namespace Kokkos {
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
 
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
       p_(p),state_idx_(state_idx){
       for(int i=0 ; i<16; i++)
-        state_[i] = state[i];
+        state_[i] = state(state_idx,i);
     }
 
     KOKKOS_INLINE_FUNCTION
@@ -933,6 +934,7 @@ namespace Kokkos {
     state_data_type state_;
     int_view_type p_;
     int num_states_;
+    friend class Random_XorShift1024<DeviceType>;
 
   public:
     typedef Random_XorShift1024<DeviceType> generator_type;
@@ -1001,7 +1003,7 @@ namespace Kokkos {
     KOKKOS_INLINE_FUNCTION
     Random_XorShift1024<DeviceType> get_state() const {
       const int i = DeviceType::hardware_thread_id();
-      return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
+      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
     };
 
     KOKKOS_INLINE_FUNCTION
@@ -1020,10 +1022,12 @@ namespace Kokkos {
     int p_;
     const int state_idx_;
     uint64_t* state_;
+    const int stride_;
     friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
   public:
 
     typedef Kokkos::Cuda device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
 
     enum {MAX_URAND = 0xffffffffU};
     enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
@@ -1031,30 +1035,30 @@ namespace Kokkos {
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
 
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
-      p_(p),state_idx_(state_idx),state_(state){
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
     }
 
     KOKKOS_INLINE_FUNCTION
     uint32_t urand() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
       state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
       tmp = tmp>>16;
       return static_cast<uint32_t>(tmp&MAX_URAND);
     }
 
     KOKKOS_INLINE_FUNCTION
     uint64_t urand64() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
       state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
     }
 
     KOKKOS_INLINE_FUNCTION
@@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
       if(i>=num_states_) {i = i_offset;}
   }
 
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
 #else
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
 #endif
 }
 
@@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 #endif
 
 
+namespace Impl {
 
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_range;
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_begin_end;
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (const IndexType& i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
         a(idx) = Rand::draw(gen,range);
     }
     rand_pool.free_state(gen);
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
           a(idx,k) = Rand::draw(gen,range);
       }
     }
@@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
 };
 
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
             a(idx,k,l) = Rand::draw(gen,range);
       }
     }
@@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
               a(idx,k,l,m) = Rand::draw(gen,range);
       }
     }
@@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
               a(idx,k,l,m,n) = Rand::draw(gen,range);
       }
     }
@@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
               a(idx,k,l,m,n,o) = Rand::draw(gen,range);
       }
     }
@@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
               a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
       }
     }
@@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
               a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
       }
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
         a(idx) = Rand::draw(gen,begin,end);
     }
     rand_pool.free_state(gen);
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
           a(idx,k) = Rand::draw(gen,begin,end);
       }
     }
@@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
 };
 
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
             a(idx,k,l) = Rand::draw(gen,begin,end);
       }
     }
@@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
               a(idx,k,l,m) = Rand::draw(gen,begin,end);
       }
     }
@@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()){
-        for(unsigned int l=0;l<a.dimension_1();l++)
-          for(unsigned int m=0;m<a.dimension_2();m++)
-            for(unsigned int n=0;n<a.dimension_3();n++)
-              for(unsigned int o=0;o<a.dimension_4();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())){
+        for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
+          for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
+            for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
+              for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
           a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
           a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
 };
 
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
             a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
       }
     }
@@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
   }
 };
 
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
               a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
       }
     }
@@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
   }
 };
 
-template<class ViewType, class RandomPool>
+}
+
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
 }
 
-template<class ViewType, class RandomPool>
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
 }
 }
 
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
index eade74ed93..c906b9f2cd 100644
--- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -50,6 +50,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
+#include <chrono>
 
 namespace Test {
 
@@ -207,7 +208,6 @@ struct test_histogram1d_functor {
     density_1d (d1d),
     mean (1.0*num_draws/HIST_DIM1D*3)
   {
-    printf ("Mean: %e\n", mean);
   }
 
   KOKKOS_INLINE_FUNCTION void
@@ -295,7 +295,7 @@ struct test_random_scalar {
       parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
 
       //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
-      double tolerance = 2.0*sqrt(1.0/num_draws);
+      double tolerance = 1.6*sqrt(1.0/num_draws);
       double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
       double variance_expect = 1.0/3.0*mean_expect*mean_expect;
       double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
@@ -303,10 +303,10 @@ struct test_random_scalar {
       double covariance_eps = result.covariance/num_draws/2/variance_expect;
       pass_mean  = ((-tolerance < mean_eps) &&
                     ( tolerance > mean_eps)) ? 1:0;
-      pass_var   = ((-tolerance < variance_eps) &&
-                    ( tolerance > variance_eps)) ? 1:0;
-      pass_covar = ((-1.4*tolerance < covariance_eps) &&
-                    ( 1.4*tolerance > covariance_eps)) ? 1:0;
+      pass_var   = ((-1.5*tolerance < variance_eps) &&
+                    ( 1.5*tolerance > variance_eps)) ? 1:0;
+      pass_covar = ((-2.0*tolerance < covariance_eps) &&
+                    ( 2.0*tolerance > covariance_eps)) ? 1:0;
       cerr << "Pass: " << pass_mean
            << " " << pass_var
            << " " << mean_eps
@@ -328,12 +328,12 @@ struct test_random_scalar {
       double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
       double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
       double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
-      pass_hist1d_mean  = ((-tolerance < mean_eps) &&
-                           ( tolerance > mean_eps)) ? 1:0;
-      pass_hist1d_var   = ((-tolerance < variance_eps) &&
-                           ( tolerance > variance_eps)) ? 1:0;
-      pass_hist1d_covar = ((-tolerance < covariance_eps) &&
-                           ( tolerance > covariance_eps)) ? 1:0;
+      pass_hist1d_mean  = ((-0.0001 < mean_eps) &&
+                           ( 0.0001 > mean_eps)) ? 1:0;
+      pass_hist1d_var   = ((-0.07 < variance_eps) &&
+                           ( 0.07 > variance_eps)) ? 1:0;
+      pass_hist1d_covar = ((-0.06 < covariance_eps) &&
+                           ( 0.06 > covariance_eps)) ? 1:0;
 
       cerr << "Density 1D: " << mean_eps
            << " " << variance_eps
@@ -363,8 +363,8 @@ struct test_random_scalar {
       double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
       pass_hist3d_mean  = ((-tolerance < mean_eps) &&
                            ( tolerance > mean_eps)) ? 1:0;
-      pass_hist3d_var   = ((-tolerance < variance_eps) &&
-                           ( tolerance > variance_eps)) ? 1:0;
+      pass_hist3d_var   = ((-1.2*tolerance < variance_eps) &&
+                           ( 1.2*tolerance > variance_eps)) ? 1:0;
       pass_hist3d_covar = ((-tolerance < covariance_eps) &&
                            ( tolerance > covariance_eps)) ? 1:0;
 
@@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
   typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
   typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
 
+
+  uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+  cerr << "Test Seed:" << ticks << endl;
+
+  RandomGenerator pool(ticks);
+
   cerr << "Test Scalar=int" << endl;
-  RandomGenerator pool(31891);
   test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
   ASSERT_EQ( test_int.pass_mean,1);
   ASSERT_EQ( test_int.pass_var,1);
diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake
new file mode 100644
index 0000000000..801c20067b
--- /dev/null
+++ b/lib/kokkos/cmake/deps/CUDA.cmake
@@ -0,0 +1,79 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+# Check for CUDA support
+
+SET(_CUDA_FAILURE OFF)
+
+# Have CMake find CUDA
+IF(NOT _CUDA_FAILURE)
+  FIND_PACKAGE(CUDA 3.2)
+  IF (NOT CUDA_FOUND)
+    SET(_CUDA_FAILURE ON)
+  ENDIF()
+ENDIF()
+
+IF(NOT _CUDA_FAILURE)
+  # if we haven't met failure
+  macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
+    TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
+  endmacro()
+  GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
+  GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUDA)
+ELSE()
+  SET(TPL_ENABLE_CUDA OFF)
+ENDIF()
diff --git a/lib/kokkos/cmake/deps/CUSPARSE.cmake b/lib/kokkos/cmake/deps/CUSPARSE.cmake
new file mode 100644
index 0000000000..205f5e2a98
--- /dev/null
+++ b/lib/kokkos/cmake/deps/CUSPARSE.cmake
@@ -0,0 +1,64 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
+
+IF (TPL_ENABLE_CUDA)
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
+  GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
+  GLOBAL_SET(TPL_CUSPARSE_LIBRARIES    ${CUDA_cusparse_LIBRARY})
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
+ENDIF()
+
diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake
new file mode 100644
index 0000000000..275abd3a5d
--- /dev/null
+++ b/lib/kokkos/cmake/deps/HWLOC.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Hardware locality detection and control library.
+#
+#  Acquisition information:
+#    Date checked:  November 2011
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        http://www.open-mpi.org/projects/hwloc/
+#    Version:       1.3
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
+  REQUIRED_HEADERS hwloc.h
+  REQUIRED_LIBS_NAMES "hwloc"
+  )
diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake
new file mode 100644
index 0000000000..46d0a939ca
--- /dev/null
+++ b/lib/kokkos/cmake/deps/Pthread.cmake
@@ -0,0 +1,83 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+SET(USE_THREADS FALSE)
+
+IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
+  # Use CMake's Thread finder since it is a bit smarter in determining
+  # whether pthreads is already built into the compiler and doesn't need
+  # a library to link.
+  FIND_PACKAGE(Threads)
+  #If Threads found a copy of pthreads make sure it is one of the cases the tribits
+  #tpl system cannot handle.
+  IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
+    IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
+      SET(USE_THREADS TRUE)
+    ENDIF()
+  ENDIF()
+ENDIF()
+
+IF(USE_THREADS)
+  SET(TPL_Pthread_INCLUDE_DIRS "")
+  SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
+  SET(TPL_Pthread_LIBRARY_DIRS "")
+  TIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
+ELSE()
+  TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
+    REQUIRED_HEADERS pthread.h
+    REQUIRED_LIBS_NAMES pthread
+      )
+ENDIF()
diff --git a/lib/kokkos/cmake/deps/QTHREAD.cmake b/lib/kokkos/cmake/deps/QTHREAD.cmake
new file mode 100644
index 0000000000..994b72b200
--- /dev/null
+++ b/lib/kokkos/cmake/deps/QTHREAD.cmake
@@ -0,0 +1,70 @@
+# @HEADER
+# ************************************************************************
+#
+#            Trilinos: An Object-Oriented Solver Framework
+#                 Copyright (2001) Sandia Corporation
+#
+#
+# Copyright (2001) Sandia Corporation. Under the terms of Contract
+# DE-AC04-94AL85000, there is a non-exclusive license for use of this
+# work by or on behalf of the U.S. Government.  Export of this program
+# may require a license from the United States Government.
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the Corporation nor the names of the
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# NOTICE:  The United States Government is granted for itself and others
+# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
+# license in this data to reproduce, prepare derivative works, and
+# perform publicly and display publicly.  Beginning five (5) years from
+# July 25, 2001, the United States Government is granted for itself and
+# others acting on its behalf a paid-up, nonexclusive, irrevocable
+# worldwide license in this data to reproduce, prepare derivative works,
+# distribute copies to the public, perform publicly and display
+# publicly, and to permit others to do so.
+#
+# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
+# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
+# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
+# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
+# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
+# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
+#
+# ************************************************************************
+# @HEADER
+
+
+#-----------------------------------------------------------------------------
+#  Qthreads user-level threading / lightweight task library.
+#
+#  Acquisition information:
+#    Date checked:  July 2014
+#    Checked by:    H. Carter Edwards <hcedwar AT sandia.gov>
+#    Source:        https://code.google.com/p/qthreads
+#
+
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
+  REQUIRED_HEADERS qthread.h
+  REQUIRED_LIBS_NAMES "qthread"
+  )
+
diff --git a/lib/kokkos/cmake/tribits.cmake b/lib/kokkos/cmake/tribits.cmake
new file mode 100644
index 0000000000..34cd216f81
--- /dev/null
+++ b/lib/kokkos/cmake/tribits.cmake
@@ -0,0 +1,485 @@
+INCLUDE(CMakeParseArguments)
+INCLUDE(CTest)
+
+FUNCTION(ASSERT_DEFINED VARS)
+  FOREACH(VAR ${VARS})
+    IF(NOT DEFINED ${VAR})
+      MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
+    ENDIF()
+  ENDFOREACH()
+ENDFUNCTION()
+
+MACRO(GLOBAL_SET VARNAME)
+  SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
+ENDMACRO()
+
+MACRO(PREPEND_GLOBAL_SET VARNAME)
+  ASSERT_DEFINED(${VARNAME})
+  GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
+ENDMACRO()
+
+FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
+  ASSERT_DEFINED(${VARNAME})
+  IF (${VARNAME})
+    SET(TMP ${${VARNAME}})
+    LIST(REMOVE_DUPLICATES TMP)
+    GLOBAL_SET(${VARNAME} ${TMP})
+  ENDIF()
+ENDFUNCTION()
+
+MACRO(TRIBITS_ADD_OPTION_AND_DEFINE  USER_OPTION_NAME  MACRO_DEFINE_NAME DOCSTRING  DEFAULT_VALUE)
+  MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
+  SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
+  IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
+    IF(${USER_OPTION_NAME})
+      GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
+    ELSE()
+      GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
+    ENDIF()
+  ENDIF()
+ENDMACRO()
+
+FUNCTION(TRIBITS_CONFIGURE_FILE  PACKAGE_NAME_CONFIG_FILE)
+
+  # Configure the file
+  CONFIGURE_FILE(
+    ${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
+    ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
+    )
+
+ENDFUNCTION()
+
+MACRO(TRIBITS_ADD_DEBUG_OPTION)
+  TRIBITS_ADD_OPTION_AND_DEFINE(
+    ${PROJECT_NAME}_ENABLE_DEBUG
+    HAVE_${PROJECT_NAME_UC}_DEBUG
+    "Enable a host of runtime debug checking."
+    OFF
+    )
+ENDMACRO()
+
+
+MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
+  FOREACH(TEST_DIR ${ARGN})
+    ADD_SUBDIRECTORY(${TEST_DIR})
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
+
+  IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
+    FOREACH(EXAMPLE_DIR ${ARGN})
+      ADD_SUBDIRECTORY(${EXAMPLE_DIR})
+    ENDFOREACH()
+  ENDIF()
+
+ENDMACRO()
+
+MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
+  SET(PROP_VALUES)
+  FOREACH(TARGET_X ${ARGN})
+    LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
+  ENDFOREACH()
+  SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
+ENDMACRO()
+
+MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
+  ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
+  SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
+ENDMACRO()
+
+# Older versions of CMake do not make include directories transitive
+MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
+  TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
+  FOREACH(DEP_LIB ${ARGN})
+    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
+    TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
+  ENDFOREACH()
+ENDMACRO()
+
+FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
+
+  SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
+  SET(oneValueArgs)
+  SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  IF(PARSE_HEADERS)
+    LIST(REMOVE_DUPLICATES PARSE_HEADERS)
+  ENDIF()
+  IF(PARSE_SOURCES)
+    LIST(REMOVE_DUPLICATES PARSE_SOURCES)
+  ENDIF()
+
+  # Local variable to hold all of the libraries that will be directly linked
+  # to this library.
+  SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
+
+  # Add dependent libraries passed directly in
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  IF (PARSE_DEPLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
+  ENDIF()
+
+  # Add the library and all the dependencies
+
+  IF (PARSE_DEFINES)
+    ADD_DEFINITIONS(${PARSE_DEFINES})
+  ENDIF()
+
+  IF (PARSE_STATIC)
+    SET(STATIC_KEYWORD "STATIC")
+  ELSE()
+    SET(STATIC_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_SHARED)
+    SET(SHARED_KEYWORD "SHARED")
+  ELSE()
+    SET(SHARED_KEYWORD)
+  ENDIF()
+
+  IF (PARSE_TESTONLY)
+    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  ELSE()
+    SET(EXCLUDE_FROM_ALL_KEYWORD)
+  ENDIF()
+  IF (NOT PARSE_CUDALIBRARY)
+    ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${STATIC_KEYWORD}
+      ${SHARED_KEYWORD}
+      ${EXCLUDE_FROM_ALL_KEYWORD}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ELSE()
+    CUDA_ADD_LIBRARY(
+      ${LIBRARY_NAME}
+      ${PARSE_HEADERS}
+      ${PARSE_NOINSTALLHEADERS}
+      ${PARSE_SOURCES}
+      )
+  ENDIF()
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
+
+  IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
+
+    INSTALL(
+      TARGETS ${LIBRARY_NAME}
+      EXPORT ${PROJECT_NAME}
+      RUNTIME DESTINATION bin
+      LIBRARY DESTINATION lib
+      ARCHIVE DESTINATION lib
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+    INSTALL(
+      FILES  ${PARSE_HEADERS}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+      INSTALL(
+      DIRECTORY  ${PARSE_HEADERS_INSTALL_SUBDIR}
+      EXPORT ${PROJECT_NAME}
+      DESTINATION include
+      COMPONENT ${PACKAGE_NAME}
+      )
+
+  ENDIF()
+
+  IF (NOT PARSE_TESTONLY)
+    PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
+    REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
+  ENDIF()
+
+ENDFUNCTION()
+
+FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
+
+  SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
+  SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
+  SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
+
+  IF (PARSE_TESTONLYLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
+  ENDIF()
+
+  IF (PARSE_IMPORTEDLIBS)
+    LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
+  ENDIF()
+
+  SET (EXE_SOURCES)
+  IF(PARSE_DIRECTORY)
+    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
+      IF(IS_ABSOLUTE ${SOURCE_FILE})
+        SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
+      ELSE()
+        SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
+      ENDIF()
+    ENDFOREACH( )
+  ELSE()
+    FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
+      SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
+    ENDFOREACH( )
+  ENDIF()
+
+  SET(EXE_BINARY_NAME ${EXE_NAME})
+  IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
+    SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
+  ENDIF()
+
+  IF (PARSE_TESTONLY)
+    SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
+  ELSE()
+    SET(EXCLUDE_FROM_ALL_KEYWORD)
+  ENDIF()
+  ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
+  # Apply definitions after the target exists, and to the real target name.
+  IF (PARSE_TARGET_DEFINES)
+    TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
+  ENDIF()
+
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
+
+  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
+    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
+  ENDIF()
+
+  IF(PARSE_INSTALLABLE)
+    INSTALL(
+      TARGETS ${EXE_BINARY_NAME}
+      EXPORT ${PROJECT_NAME}
+        DESTINATION bin
+    )
+  ENDIF()
+ENDFUNCTION()
+
+ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
+
+FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
+
+  SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
+  SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
+  SET(multiValueArgs)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
+
+  IF(WIN32)
+    ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
+  ELSE()
+    ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
+  ENDIF()
+  ADD_DEPENDENCIES(check ${TEST_NAME})
+
+  IF(PARSE_FAIL_REGULAR_EXPRESSION)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
+  ENDIF()
+
+  IF(PARSE_PASS_REGULAR_EXPRESSION)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
+  ENDIF()
+
+  IF(PARSE_WILL_FAIL)
+    SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
+  ENDIF()
+
+  IF(PARSE_ADDED_TESTS_NAMES_OUT)
+    SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
+  ENDIF()
+
+  IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
+    SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
+  ENDIF()
+
+ENDFUNCTION()
+
+MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
+  ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
+  TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
+  TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
+ENDMACRO()
+
+FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
+
+  SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
+  SET(oneValueArgs)
+  SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
+
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
+  IF (PARSE_REQUIRED_LIBS_NAMES)
+    FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
+    IF(NOT TPL_${TPL_NAME}_LIBRARIES)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+  IF (PARSE_REQUIRED_HEADERS)
+    FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
+    IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
+      SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
+    ENDIF()
+  ENDIF()
+
+
+  IF (_${TPL_NAME}_ENABLE_SUCCESS)
+    TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
+  ENDIF()
+
+ENDFUNCTION()
+
+MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
+  GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
+  INCLUDE("${TPL_FILE}")
+  IF(TARGET TPL_LIB_${TPL_NAME})
+    MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
+    SET(TPL_ENABLE_${TPL_NAME} TRUE)
+  ELSE()
+    MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
+    SET(TPL_ENABLE_${TPL_NAME} FALSE)
+  ENDIF()
+ENDMACRO()
+
+MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
+  IF(TYPE STREQUAL "REQUIRED")
+    SET(REQUIRED TRUE)
+  ELSE()
+    SET(REQUIRED FALSE)
+  ENDIF()
+  IF(TARGET ${TARGET_NAME})
+    PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
+  ELSE()
+    IF(REQUIRED)
+      MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
+    ENDIF()
+  ENDIF()
+ENDMACRO()
+
+MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
+  FOREACH(DEP ${ARGN})
+    PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_ENABLE_TPLS)
+  FOREACH(TPL ${ARGN})
+    IF(TARGET ${TPL})
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
+    ELSE()
+      GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
+    ENDIF()
+  ENDFOREACH()
+ENDMACRO()
+
+MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
+
+  SET(options)
+  SET(oneValueArgs)
+  SET(multiValueArgs 
+    LIB_REQUIRED_PACKAGES
+    LIB_OPTIONAL_PACKAGES
+    TEST_REQUIRED_PACKAGES
+    TEST_OPTIONAL_PACKAGES
+    LIB_REQUIRED_TPLS
+    LIB_OPTIONAL_TPLS
+    TEST_REQUIRED_TPLS
+    TEST_OPTIONAL_TPLS
+    REGRESSION_EMAIL_LIST
+    SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
+  )
+  CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
+
+  GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
+  TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
+  TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
+
+  TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
+
+ENDMACRO()
+
+MACRO(TRIBITS_SUBPACKAGE NAME)
+  SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
+  SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+
+  ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
+
+  GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
+
+  INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
+
+ENDMACRO(TRIBITS_SUBPACKAGE)
+
+MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+  TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
+ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
+
+MACRO(TRIBITS_PACKAGE_DECL NAME)
+
+  PROJECT(${NAME})
+  STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
+  SET(PACKAGE_NAME ${PROJECT_NAME})
+  STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
+
+  SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
+  FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
+  FOREACH(TPL_FILE ${TPLS_FILES})
+    TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
+  ENDFOREACH()
+
+ENDMACRO()
+
+
+MACRO(TRIBITS_PROCESS_SUBPACKAGES)
+  FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
+  FOREACH(SUBPACKAGE ${SUBPACKAGES})
+    GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
+    GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
+    ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
+  ENDFOREACH()
+ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
+
+MACRO(TRIBITS_PACKAGE_DEF)
+ENDMACRO(TRIBITS_PACKAGE_DEF)
+
+MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
+
+MACRO(TRIBITS_EXCLUDE_FILES)
+ENDMACRO(TRIBITS_EXCLUDE_FILES)
+
+MACRO(TRIBITS_PACKAGE_POSTPROCESS)
+ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)
+
diff --git a/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt
new file mode 100644
index 0000000000..9f56f2fd48
--- /dev/null
+++ b/lib/kokkos/config/kokkos-trilinos-integration-procedure.txt
@@ -0,0 +1,153 @@
+// -------------------------------------------------------------------------------- //
+
+The following steps are for workstations/servers with the SEMS environment installed.
+
+// -------------------------------------------------------------------------------- //
+Summary:
+
+- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
+
+- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
+
+- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
+
+- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
+
+- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
+// -------------------------------------------------------------------------------- //
+
+
+// -------------------------------------------------------------------------------- //
+
+Step 1:
+  1.1. Update kokkos develop branch (NOT a fork)
+
+         (From kokkos directory):
+         git fetch --all
+         git checkout develop
+         git reset --hard origin/develop
+
+  1.2. Create a testing directory - here the directory is created within the kokkos directory
+
+         mkdir testing
+         cd testing
+
+  1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
+
+         ../config/test_all_sandia
+
+  1.4 Clean repository of untracked files
+
+        cd ../
+        git clean -df
+
+// -------------------------------------------------------------------------------- //
+
+Step 2:
+  2.1 Update Trilinos develop branch
+
+        (From Trilinos directory):
+        git checkout develop
+        git fetch --all
+        git reset --hard origin/develop
+        git clean -df
+
+  2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files
+
+        module load python/2.7.9
+        python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+
+// -------------------------------------------------------------------------------- //
+
+Step 3:
+  3.1. Build and test Trilinos with 3 different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following 3 configurations with appropriate environment variable(s):
+
+      - GCC/4.7.2-OpenMP/Complex
+          Run tests with the following environment variable:
+
+            export OMP_NUM_THREADS=2
+
+
+      - Intel/15.0.2-Serial/NoComplex
+
+
+      - GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
+          Run tests with the following environment variables:
+
+            export CUDA_LAUNCH_BLOCKING=1
+            export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
+
+
+        mkdir Build
+        cd Build
+        cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
+            ** Set the path to Trilinos appropriately within the configure-all script **
+        source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
+        source configure-all
+        make -k  (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
+        ctest
+
+  3.2. Compare the failed test output to the test output on the dashboard ( testing.sandia.gov/cdash select Trilinos ); investigate and fix problems if new tests fail after the Kokkos snapshot
+
+// -------------------------------------------------------------------------------- //
+
+Step 4:
+  4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
+
+       - DO NOT fast-forward the merge!!!!
+
+       (From kokkos directory):
+       git checkout master
+       git fetch --all
+       # Ensure we are on the current origin/master
+       git reset --hard origin/master
+       git merge --no-ff origin/develop
+
+  4.2. Update the tag in kokkos/config/master_history.txt
+       Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
+       Tag format: #.#.##
+
+       # Prepend master_history.txt with 
+       
+       # tag: #.#.##
+       # date: mm/dd/yyyy
+       # master: sha1
+       # develop: sha1
+       # -----------------------
+
+       git commit --amend -a
+
+       git tag -a #.#.##
+         tag: #.#.##
+         date: mm/dd/yyyy
+         master: sha1
+         develop: sha1
+
+       git push --follow-tags origin master
+
+// -------------------------------------------------------------------------------- //
+
+Step 5:
+  5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated
+
+       (From Trilinos directory):
+       git checkout develop
+       git fetch --all
+       git reset --hard origin/develop
+       git clean -df
+
+  5.2. Snapshot Kokkos master branch into Trilinos
+
+       (From kokkos directory):
+       git fetch --all
+       git checkout tags/#.#.##
+       git clean -df
+
+       python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+       
+  5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
+
+       (From Trilinos directory):
+       git push
+
+// -------------------------------------------------------------------------------- //
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
new file mode 100644
index 0000000000..f2eb674578
--- /dev/null
+++ b/lib/kokkos/config/master_history.txt
@@ -0,0 +1,3 @@
+tag:  2.01.00    date: 07:21:2016    master: xxxxxxxx    develop: fa6dfcc4
+tag:  2.01.06    date: 09:02:2016    master: 9afaa87f    develop: 555f1a3a
+
diff --git a/lib/kokkos/config/nvcc_wrapper b/lib/kokkos/config/nvcc_wrapper
index d583866191..6093cb61bd 100755
--- a/lib/kokkos/config/nvcc_wrapper
+++ b/lib/kokkos/config/nvcc_wrapper
@@ -1,17 +1,12 @@
 #!/bin/bash
 #
 # This shell script (nvcc_wrapper) wraps both the host compiler and
-# NVCC, if you are building Trilinos with CUDA enabled.  The script
-# remedies some differences between the interface of NVCC and that of
-# the host compiler, in particular for linking.  It also means that
-# Trilinos doesn't need separate .cu files; it can just use .cpp
-# files.
+# NVCC, if you are building legacy C or C++ code with CUDA enabled.
+# The script remedies some differences between the interface of NVCC
+# and that of the host compiler, in particular for linking.
+# It also means that a legacy code doesn't need separate .cu files;
+# it can just use .cpp files.
 #
-# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
-# script obsolete.  For now, this script exists and if you want to
-# build Trilinos with CUDA enabled, you must use this script as your
-# compiler.
-
 # Default settings: change those according to your machine.  For
 # example, you may have have two different wrappers with either icpc
 # or g++ as their back-end compiler.  The defaults can be overwritten
@@ -53,6 +48,10 @@ object_files=""
 # Link objects for the host linker only
 object_files_xlinker=""
 
+# Shared libraries with version numbers are not handled correctly by NVCC
+shared_versioned_libraries_host=""
+shared_versioned_libraries=""
+
 # Does the User set the architecture 
 arch_set=0
 
@@ -76,6 +75,9 @@ first_xcompiler_arg=1
 
 temp_dir=${TMPDIR:-/tmp}
 
+# Check if we have an optimization argument already
+optimization_applied=0
+
 #echo "Arguments: $# $@"
 
 while [ $# -gt 0 ]
@@ -97,8 +99,17 @@ do
   *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
     cpp_files="$cpp_files $1"
     ;;
+   # Ensure we only have one optimization flag because NVCC doesn't allow multiple
+  -O*)
+    if [ $optimization_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
+    else
+       shared_args="$shared_args $1"
+       optimization_applied=1
+    fi
+    ;;
   #Handle shared args (valid for both nvcc and the host compiler)
-  -O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
+  -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
     shared_args="$shared_args $1"
     ;;
   #Handle shared args that have an argument
@@ -107,7 +118,7 @@ do
     shift
     ;;
   #Handle known nvcc args
-  -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
+  -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
     cuda_args="$cuda_args $1"
     ;;
   #Handle known nvcc args that have an argument
@@ -175,10 +186,15 @@ do
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
   #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
-  *.so.*|*.dylib)
+  *.dylib)
     object_files="$object_files -Xlinker $1"
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
+  #Handle shared libraries with *.so.* names, which nvcc cannot handle directly.
+  *.so.*)
+    shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
+    shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
+  ;;
   #All other args are sent to the host compiler
   *)
     if [ $first_xcompiler_arg -eq 1 ]; then
@@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
 fi
 
 #Compose compilation command
-nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
+nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
 if [ $first_xcompiler_arg -eq 0 ]; then
   nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
 fi
 
 #Compose host only command
-host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
+host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
 
 #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
 if [ $replace_pragma_ident -eq 1 ]; then
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index add45b77b4..aac036a8f3 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -6,18 +6,132 @@
 
 set -o pipefail
 
+# Determine current machine
+
+MACHINE=""
+HOSTNAME=$(hostname)
+if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
+    MACHINE=white
+elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
+    MACHINE=bowman
+elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
+    MACHINE=shepard
+elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
+    MACHINE=sems
+else
+    echo "Unrecognized machine" >&2
+    exit 1
+fi
+
 GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
+IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
 CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
 CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
 
 GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
+IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
 CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
 INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
 CUDA_WARNING_FLAGS=""
 
-BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
-CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
+# Default. Machine specific can override
+DEBUG=False
+ARGS=""
+CUSTOM_BUILD_LIST=""
+DRYRUN=False
+BUILD_ONLY=False
+declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
+TEST_SCRIPT=False
+SKIP_HWLOC=False
+
+ARCH_FLAG=""
+
+#
+# Machine specific config
+#
+
+if [ "$MACHINE" = "sems" ]; then
+    source /projects/modulefiles/utils/sems-modules-init.sh
+    source /projects/modulefiles/utils/kokkos-modules-init.sh
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
+    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
+               "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
+               "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
+    )
+
+elif [ "$MACHINE" = "white" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+    IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
+    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
+
+    # Don't do pthread on white
+    GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+               "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=Power8"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+elif [ "$MACHINE" = "bowman" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=KNL"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+elif [ "$MACHINE" = "shepard" ]; then
+    source /etc/profile.d/modules.sh
+    SKIP_HWLOC=True
+    export SLURM_TASKS_PER_NODE=32
+
+    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+
+    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+
+    # Format: (compiler module-list build-list exe-name warning-flag)
+    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+               "intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+    )
+
+    ARCH_FLAG="--arch=HSW"
+    NUM_JOBS_TO_RUN_IN_PARALLEL=8
+
+else
+    echo "Unhandled machine $MACHINE" >&2
+    exit 1
+fi
 
 export OMP_NUM_THREADS=4
 
@@ -25,23 +139,12 @@ declare -i NUM_RESULTS_TO_KEEP=7
 
 RESULT_ROOT_PREFIX=TestAll
 
-source /projects/modulefiles/utils/sems-modules-init.sh
-source /projects/modulefiles/utils/kokkos-modules-init.sh
-
 SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
 
 #
 # Handle arguments
 #
 
-DEBUG=False
-ARGS=""
-CUSTOM_BUILD_LIST=""
-DRYRUN=False
-BUILD_ONLY=False
-declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
-TEST_SCRIPT=False
-
 while [[ $# > 0 ]]
 do
 key="$1"
@@ -61,6 +164,9 @@ BUILD_ONLY=True
 --test-script*)
 TEST_SCRIPT=True
 ;;
+--skip-hwloc*)
+SKIP_HWLOC=True
+;;
 --num*)
 NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
 ;;
@@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
 echo "    Defaults to root repo containing this script"
 echo "--debug: Run tests in debug. Defaults to False"
 echo "--test-script: Test this script, not Kokkos"
+echo "--skip-hwloc: Do not do hwloc tests"
 echo "--num=N: Number of jobs to run in parallel "
 echo "--dry-run: Just print what would be executed"
 echo "--build-only: Just do builds, don't run anything"
@@ -82,21 +189,16 @@ echo "    Valid items:"
 echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
 echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
 echo ""
+
 echo "ARGS: list of expressions matching compilers to test"
-echo "  supported compilers"
-echo "    gcc/4.7.2"
-echo "    gcc/4.8.4"
-echo "    gcc/4.9.2"
-echo "    gcc/5.1.0"
-echo "    intel/14.0.4"
-echo "    intel/15.0.2"
-echo "    intel/16.0.1"
-echo "    clang/3.5.2"
-echo "    clang/3.6.1"
-echo "    cuda/6.5.14"
-echo "    cuda/7.0.28"
-echo "    cuda/7.5.18"
+echo "  supported compilers sems"
+for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+    echo "    $COMPILER"
+done
 echo ""
+
 echo "Examples:"
 echo "  Run all tests"
 echo "  % test_all_sandia"
@@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
     ARGS='?'
 fi
 
-# Format: (compiler module-list build-list exe-name warning-flag)
-COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-           "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-           "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-           "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-           "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-           "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-           "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-           "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-           "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-           "cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
-           "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
-           "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
-           )
-
 # Process args to figure out which compilers to test
 COMPILERS_TO_TEST=""
 for ARG in $ARGS; do
@@ -240,18 +327,19 @@ run_cmd() {
     fi
 }
 
-# report_and_log_test_results <SUCCESS> <DESC> <PHASE>
+# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
 report_and_log_test_result() {
     # Use sane var names
-    local success=$1; local desc=$2; local phase=$3;
+    local success=$1; local desc=$2; local comment=$3;
 
     if [ "$success" = "0" ]; then
 	echo "  PASSED $desc"
-        touch $PASSED_DIR/$desc
+        echo $comment > $PASSED_DIR/$desc
     else
+        # For failures, comment should be the name of the phase that failed
 	echo "  FAILED $desc" >&2
-        echo $phase > $FAILED_DIR/$desc
-        cat ${desc}.${phase}.log
+        echo $comment > $FAILED_DIR/$desc
+        cat ${desc}.${comment}.log
     fi
 }
 
@@ -309,6 +397,8 @@ single_build_and_test() {
 
     echo "  Starting job $desc"
 
+    local comment="no_comment"
+
     if [ "$TEST_SCRIPT" = "True" ]; then
         local rand=$[ 1 + $[ RANDOM % 10 ]]
         sleep $rand
@@ -316,14 +406,19 @@ single_build_and_test() {
             run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
         fi
     else
-        run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+        run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+        local -i build_start_time=$(date +%s)
         run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+        local -i build_end_time=$(date +%s)
+        comment="build_time=$(($build_end_time-$build_start_time))"
         if [[ "$BUILD_ONLY" == False ]]; then
             run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
+            local -i run_end_time=$(date +%s)
+            comment="$comment run_time=$(($run_end_time-$build_end_time))"
         fi
     fi
 
-    report_and_log_test_result 0 $desc
+    report_and_log_test_result 0 $desc "$comment"
 
     return 0
 }
@@ -374,7 +469,7 @@ build_and_test_all() {
 	run_in_background $compiler $build $BUILD_TYPE
 
         # If not cuda, do a hwloc test too
-        if [[ "$compiler" != cuda* ]]; then
+        if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
             run_in_background $compiler $build "hwloc-$BUILD_TYPE"
         fi
     done
@@ -401,7 +496,11 @@ wait_summarize_and_exit() {
     echo "PASSED TESTS"
     echo "#######################################################"
 
-    \ls -1 $PASSED_DIR | sort
+    local passed_test
+    for passed_test in $(\ls -1 $PASSED_DIR | sort)
+    do
+        echo $passed_test $(cat $PASSED_DIR/$passed_test)
+    done
 
     echo "#######################################################"
     echo "FAILED TESTS"
@@ -409,7 +508,7 @@ wait_summarize_and_exit() {
 
     local failed_test
     local -i rv=0
-    for failed_test in $(\ls -1 $FAILED_DIR)
+    for failed_test in $(\ls -1 $FAILED_DIR | sort)
     do
         echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
         rv=$rv+1
diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt
index 6b57802935..726d403452 100644
--- a/lib/kokkos/containers/performance_tests/CMakeLists.txt
+++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt
@@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
   LIST( APPEND SOURCES TestOpenMP.cpp)
 ENDIF()
 
-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  PerformanceTest
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
   SOURCES ${SOURCES}
   COMM serial mpi
+  TESTONLYLIBS kokkos_gtest
+  )
+
+TRIBITS_ADD_TEST(
+  PerformanceTest
+  NAME PerfTestExec
+  COMM serial mpi
   NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
   FAIL_REGULAR_EXPRESSION "  FAILED  "
-  TESTONLYLIBS kokkos_gtest
   )
diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp
index aee262de93..8183adaa60 100644
--- a/lib/kokkos/containers/performance_tests/TestCuda.cpp
+++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp
@@ -54,6 +54,8 @@
 
 #if defined( KOKKOS_HAVE_CUDA )
 
+#include <TestDynRankView.hpp>
+
 #include <Kokkos_UnorderedMap.hpp>
 
 #include <TestGlobal2LocalIds.hpp>
@@ -77,6 +79,13 @@ protected:
   }
 };
 
+TEST_F( cuda, dynrankview_perf ) 
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
+}
+
 TEST_F( cuda, global_2_local)
 {
   std::cout << "Cuda" << std::endl;
diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
new file mode 100644
index 0000000000..aab6e6988f
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp
@@ -0,0 +1,265 @@
+
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
+#define KOKKOS_TEST_DYNRANKVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_DynRankView.hpp>
+#include <vector>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// Compare performance of DynRankView to View, specific focus on the parenthesis operators
+
+namespace Performance {
+
+//View functor
+template <typename DeviceType>
+struct InitViewFunctor {
+  typedef Kokkos::View<double***, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+  struct SumComputationTest
+  {
+    typedef Kokkos::View<double***, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::View<double*, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+        for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+
+template <typename DeviceType>
+struct InitStrideViewFunctor {
+  typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+
+template <typename DeviceType>
+struct InitViewRank7Functor {
+  typedef Kokkos::View<double*******, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+};
+
+//DynRankView functor
+template <typename DeviceType>
+struct InitDynRankViewFunctor {
+  typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+  inviewtype _inview;
+
+  InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+      for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+        _inview(i,j,k) = i/2 -j*j + k/3;
+      }
+    }
+  }
+
+  struct SumComputationTest
+  {
+    typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
+    inviewtype _inview;
+
+    typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
+    outviewtype _outview;
+
+    KOKKOS_INLINE_FUNCTION
+    SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const int i) const {
+      for (unsigned j = 0; j < _inview.dimension(1); ++j) {
+        for (unsigned k = 0; k < _inview.dimension(2); ++k) {
+          _outview(i) += _inview(i,j,k) ;
+        }
+      }
+    }
+  };
+
+};
+
+
+template <typename DeviceType>
+void test_dynrankview_op_perf( const int par_size )
+{
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
+  const size_type dim2 = 900;
+  const size_type dim3 = 300;
+
+  double elapsed_time_view = 0;
+  double elapsed_time_compview = 0;
+  double elapsed_time_strideview = 0;
+  double elapsed_time_view_rank7 = 0;
+  double elapsed_time_drview = 0;
+  double elapsed_time_compdrview = 0;
+  Kokkos::Timer timer;
+  {
+    Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
+    typedef InitViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view = timer.seconds();
+    std::cout << " View time (init only): " << elapsed_time_view << std::endl;
+
+
+    timer.reset();
+    Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compview = timer.seconds();
+    std::cout << " View sum computation time: " << elapsed_time_compview << std::endl;
+
+
+    Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
+    typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
+
+    timer.reset();
+    Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
+    DeviceType::fence();
+    elapsed_time_strideview = timer.seconds();
+    std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
+  }
+  {
+    Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
+    typedef InitViewRank7Functor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testview) );
+    DeviceType::fence();
+    elapsed_time_view_rank7 = timer.seconds();
+    std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
+  }
+  {
+    Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
+    typedef InitDynRankViewFunctor<DeviceType> FunctorType;
+
+    timer.reset();
+    Kokkos::RangePolicy<DeviceType> policy(0,par_size);
+    Kokkos::parallel_for( policy , FunctorType(testdrview) );
+    DeviceType::fence();
+    elapsed_time_drview = timer.seconds();
+    std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
+
+    timer.reset();
+    Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
+    Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
+    DeviceType::fence();
+    elapsed_time_compdrview = timer.seconds();
+    std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
+
+  }
+
+  std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
+  std::cout << " Ratio of View to View Rank7  time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
+  std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
+  std::cout << " Ratio of DynRankView to View Rank7  time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
+
+  timer.reset();
+
+} //end test_dynrankview
+
+
+} //end Performance
+#endif
diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
index fb70b8fe2e..66f1fbf092 100644
--- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
+++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
   std::cout << num_ids << ", ";
 
   double elasped_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   local_id_view local_2_global("local_ids", num_ids);
   global_id_view global_2_local((3u*num_ids)/2u);
diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
index 82a9311df7..da74d32ac1 100644
--- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
+++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -50,6 +50,8 @@
 #include <TestGlobal2LocalIds.hpp>
 #include <TestUnorderedMapPerformance.hpp>
 
+#include <TestDynRankView.hpp>
+
 #include <iomanip>
 #include <sstream>
 #include <string>
@@ -91,6 +93,13 @@ protected:
   }
 };
 
+TEST_F( openmp, dynrankview_perf ) 
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
+}
+
 TEST_F( openmp, global_2_local)
 {
   std::cout << "OpenMP" << std::endl;
diff --git a/lib/kokkos/containers/performance_tests/TestThreads.cpp b/lib/kokkos/containers/performance_tests/TestThreads.cpp
index 04d9dc0c18..4179b7de4c 100644
--- a/lib/kokkos/containers/performance_tests/TestThreads.cpp
+++ b/lib/kokkos/containers/performance_tests/TestThreads.cpp
@@ -52,6 +52,8 @@
 #include <TestGlobal2LocalIds.hpp>
 #include <TestUnorderedMapPerformance.hpp>
 
+#include <TestDynRankView.hpp>
+
 #include <iomanip>
 #include <sstream>
 #include <string>
@@ -85,6 +87,13 @@ protected:
   }
 };
 
+TEST_F( threads, dynrankview_perf ) 
+{
+  std::cout << "Threads" << std::endl;
+  std::cout << " DynRankView vs View: Initialization Only " << std::endl;
+  test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
+}
+
 TEST_F( threads, global_2_local)
 {
   std::cout << "Threads" << std::endl;
diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
index 975800229c..71d1182cbe 100644
--- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
+++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -80,7 +80,7 @@ struct UnorderedMapTest
     , map(capacity)
     , histogram(map.get_histogram())
   {
-    Kokkos::Impl::Timer wall_clock ;
+    Kokkos::Timer wall_clock ;
     wall_clock.reset();
 
     value_type v = {};
@@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
   distance_out << "\b\b\b   " << std::endl;
   block_distance_out << "\b\b\b   " << std::endl;
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
   for (int i=0;  i < num_collisions ; ++i) {
     wall_clock.reset();
     std::cout << "Collisions: " << collisions[i] << std::endl;
diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
index 0fc722c140..f72277700a 100644
--- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp
@@ -52,6 +52,12 @@
  *   2. Max rank of a DynRankView is 7
  *   3. subview name is subdynrankview
  *   4. Every subdynrankview is returned with LayoutStride
+ *
+ *   NEW: Redesigned DynRankView
+ *   5. subview function name now available
+ *   6. Copy and Copy-Assign View to DynRankView
+ *   7. deep_copy between Views and DynRankViews
+ *   8. rank( view ); returns the rank of View or DynRankView
  */
 
 #ifndef KOKKOS_DYNRANKVIEW_HPP
@@ -64,11 +70,16 @@
 namespace Kokkos {
 namespace Experimental {
 
+template< typename DataType , class ... Properties >
+class DynRankView;  //forward declare
+
 namespace Impl {
 
 template <typename Specialize>
 struct DynRankDimTraits {
 
+  enum : size_t{unspecified = ~size_t(0)};
+
   // Compute the rank of the view from the nonzero dimension arguments.
   KOKKOS_INLINE_FUNCTION
   static size_t computeRank( const size_t N0
@@ -81,13 +92,13 @@ struct DynRankDimTraits {
                            , const size_t N7 )
   {
     return
-      (   (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0 && N1 == 0 && N0 == 0) ? 0
-      : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0 && N1 == 0) ? 1
-      : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0 && N2 == 0) ? 2
-      : ( (N6 == 0 && N5 == 0 && N4 == 0 && N3 == 0) ? 3
-      : ( (N6 == 0 && N5 == 0 && N4 == 0) ? 4
-      : ( (N6 == 0 && N5 == 0) ? 5
-      : ( (N6 == 0) ? 6
+      (   (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified && N0 == unspecified) ? 0
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified && N1 == unspecified) ? 1
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified && N2 == unspecified) ? 2
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified && N3 == unspecified) ? 3
+      : ( (N6 == unspecified && N5 == unspecified && N4 == unspecified) ? 4
+      : ( (N6 == unspecified && N5 == unspecified) ? 5
+      : ( (N6 == unspecified) ? 6
       : 7 ) ) ) ) ) ) );
   }
 
@@ -112,14 +123,14 @@ struct DynRankDimTraits {
   KOKKOS_INLINE_FUNCTION
   static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) , Layout >::type createLayout( const Layout& layout )
   {
-    return Layout( layout.dimension[0] != 0 ? layout.dimension[0] : 1
-                 , layout.dimension[1] != 0 ? layout.dimension[1] : 1
-                 , layout.dimension[2] != 0 ? layout.dimension[2] : 1
-                 , layout.dimension[3] != 0 ? layout.dimension[3] : 1
-                 , layout.dimension[4] != 0 ? layout.dimension[4] : 1
-                 , layout.dimension[5] != 0 ? layout.dimension[5] : 1
-                 , layout.dimension[6] != 0 ? layout.dimension[6] : 1
-                 , layout.dimension[7] != 0 ? layout.dimension[7] : 1
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1
+                 , layout.dimension[1] != unspecified ? layout.dimension[1] : 1
+                 , layout.dimension[2] != unspecified ? layout.dimension[2] : 1
+                 , layout.dimension[3] != unspecified ? layout.dimension[3] : 1
+                 , layout.dimension[4] != unspecified ? layout.dimension[4] : 1
+                 , layout.dimension[5] != unspecified ? layout.dimension[5] : 1
+                 , layout.dimension[6] != unspecified ? layout.dimension[6] : 1
+                 , layout.dimension[7] != unspecified ? layout.dimension[7] : 1
                  );
   }
 
@@ -128,21 +139,21 @@ struct DynRankDimTraits {
   KOKKOS_INLINE_FUNCTION
   static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) , Layout>::type createLayout( const Layout& layout )
   {
-    return Layout( layout.dimension[0] != 0 ? layout.dimension[0] : 1
+    return Layout( layout.dimension[0] != unspecified ? layout.dimension[0] : 1
                  , layout.stride[0] 
-                 , layout.dimension[1] != 0 ? layout.dimension[1] : 1
+                 , layout.dimension[1] != unspecified ? layout.dimension[1] : 1
                  , layout.stride[1] 
-                 , layout.dimension[2] != 0 ? layout.dimension[2] : 1
+                 , layout.dimension[2] != unspecified ? layout.dimension[2] : 1
                  , layout.stride[2] 
-                 , layout.dimension[3] != 0 ? layout.dimension[3] : 1
+                 , layout.dimension[3] != unspecified ? layout.dimension[3] : 1
                  , layout.stride[3] 
-                 , layout.dimension[4] != 0 ? layout.dimension[4] : 1
+                 , layout.dimension[4] != unspecified ? layout.dimension[4] : 1
                  , layout.stride[4] 
-                 , layout.dimension[5] != 0 ? layout.dimension[5] : 1
+                 , layout.dimension[5] != unspecified ? layout.dimension[5] : 1
                  , layout.stride[5] 
-                 , layout.dimension[6] != 0 ? layout.dimension[6] : 1
+                 , layout.dimension[6] != unspecified ? layout.dimension[6] : 1
                  , layout.stride[6] 
-                 , layout.dimension[7] != 0 ? layout.dimension[7] : 1
+                 , layout.dimension[7] != unspecified ? layout.dimension[7] : 1
                  , layout.stride[7] 
                  );
   }
@@ -161,17 +172,141 @@ struct DynRankDimTraits {
                             , const size_t N7 )
   {
     return ViewType( arg
-                   , N0 != 0 ? N0 : 1
-                   , N1 != 0 ? N1 : 1
-                   , N2 != 0 ? N2 : 1
-                   , N3 != 0 ? N3 : 1
-                   , N4 != 0 ? N4 : 1
-                   , N5 != 0 ? N5 : 1
-                   , N6 != 0 ? N6 : 1
-                   , N7 != 0 ? N7 : 1 );
+                   , N0 != unspecified ? N0 : 1
+                   , N1 != unspecified ? N1 : 1
+                   , N2 != unspecified ? N2 : 1
+                   , N3 != unspecified ? N3 : 1
+                   , N4 != unspecified ? N4 : 1
+                   , N5 != unspecified ? N5 : 1
+                   , N6 != unspecified ? N6 : 1
+                   , N7 != unspecified ? N7 : 1 );
   }
 };
 
+  // Non-strided Layout
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutRight>::value || std::is_same<Layout , Kokkos::LayoutLeft>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0) 
+                 , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
+                 , dynrank > 2 ? layout.dimension[2] : ~size_t(0)
+                 , dynrank > 3 ? layout.dimension[3] : ~size_t(0)
+                 , dynrank > 4 ? layout.dimension[4] : ~size_t(0)
+                 , dynrank > 5 ? layout.dimension[5] : ~size_t(0)
+                 , dynrank > 6 ? layout.dimension[6] : ~size_t(0)
+                 , dynrank > 7 ? layout.dimension[7] : ~size_t(0)
+                 );
+  }
+
+  // LayoutStride
+  template <typename Layout , typename iType>
+  KOKKOS_INLINE_FUNCTION
+  static typename std::enable_if< (std::is_same<Layout , Kokkos::LayoutStride>::value) && std::is_integral<iType>::value , Layout >::type reconstructLayout( const Layout& layout , iType dynrank )
+  {
+    return Layout( dynrank > 0 ? layout.dimension[0] : ~size_t(0)
+                 , dynrank > 0 ? layout.stride[0] : (0) 
+                 , dynrank > 1 ? layout.dimension[1] : ~size_t(0)
+                 , dynrank > 1 ? layout.stride[1] : (0) 
+                 , dynrank > 2 ? layout.dimension[2] : ~size_t(0)
+                 , dynrank > 2 ? layout.stride[2] : (0) 
+                 , dynrank > 3 ? layout.dimension[3] : ~size_t(0)
+                 , dynrank > 3 ? layout.stride[3] : (0) 
+                 , dynrank > 4 ? layout.dimension[4] : ~size_t(0)
+                 , dynrank > 4 ? layout.stride[4] : (0) 
+                 , dynrank > 5 ? layout.dimension[5] : ~size_t(0)
+                 , dynrank > 5 ? layout.stride[5] : (0) 
+                 , dynrank > 6 ? layout.dimension[6] : ~size_t(0)
+                 , dynrank > 6 ? layout.stride[6] : (0) 
+                 , dynrank > 7 ? layout.dimension[7] : ~size_t(0)
+                 , dynrank > 7 ? layout.stride[7] : (0) 
+                 );
+  }
+
+  template < typename DynRankViewType , typename iType >
+  void verify_dynrankview_rank ( iType N , const DynRankViewType &drv )
+  {
+    if ( static_cast<iType>(drv.rank()) > N )
+       {
+         Kokkos::abort( "Need at least rank arguments to the operator()" ); 
+       }
+  }
+
+
+/** \brief  Assign compatible default mappings */
+struct ViewToDynRankViewTag {};
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
+    )
+  ) , ViewToDynRankViewTag >::type >
+{
+private:
+
+  enum { is_assignable_value_type =
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::value_type >::value ||
+    std::is_same< typename DstTraits::value_type
+                , typename SrcTraits::const_value_type >::value };
+
+  enum { is_assignable_layout =
+    std::is_same< typename DstTraits::array_layout
+                , typename SrcTraits::array_layout >::value ||
+    std::is_same< typename DstTraits::array_layout
+                , Kokkos::LayoutStride >::value 
+    };
+
+public:
+
+  enum { is_assignable = is_assignable_value_type &&
+                         is_assignable_layout };
+
+  typedef ViewMapping< DstTraits , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void >  SrcType ;
+
+  template < typename DT , typename ... DP , typename ST , typename ... SP >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( Kokkos::Experimental::DynRankView< DT , DP...> & dst ,  const Kokkos::View< ST , SP... > & src )
+    {
+      static_assert( is_assignable_value_type
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( is_assignable_layout
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+    // Removed dimension checks...
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+      dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
+      dst.m_map.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
+      dst.m_track.assign( src.m_track , DstTraits::is_managed );
+      dst.m_rank = src.Rank ;
+    }
+};
+
 } //end Impl
 
 /* \class DynRankView
@@ -185,145 +320,228 @@ struct DynRankDimTraits {
  *   3. subview name is subdynrankview
  *   4. Every subdynrankview is returned with LayoutStride
  *
+ *   NEW: Redesigned DynRankView
+ *   5. subview function name now available
+ *   6. Copy and Copy-Assign View to DynRankView
+ *   7. deep_copy between Views and DynRankViews
+ *   8. rank( view ); returns the rank of View or DynRankView
+ *
  */
 
+template< class > struct is_dyn_rank_view : public std::false_type {};
+
+template< class D, class ... P >
+struct is_dyn_rank_view< Kokkos::Experimental::DynRankView<D,P...> > : public std::true_type {};
+
+
 template< typename DataType , class ... Properties >
-class DynRankView : private View< DataType*******, Properties... >
+class DynRankView : public ViewTraits< DataType , Properties ... >
 {
   static_assert( !std::is_array<DataType>::value && !std::is_pointer<DataType>::value , "Cannot template DynRankView with array or pointer datatype - must be pod" );
 
-public: 
-  using view_type = View< DataType******* , Properties...>;
-  using reference_type = typename view_type::reference_type; 
-
 private: 
   template < class , class ... > friend class DynRankView ;
-  template< class , class ... > friend class Impl::ViewMapping ;
-  unsigned m_rank;
-
-public:
-  KOKKOS_INLINE_FUNCTION
-  view_type & DownCast() const { return static_cast< view_type & > (*this); }
-  KOKKOS_INLINE_FUNCTION
-  const view_type & ConstDownCast() const { return static_cast< const view_type & > (*this); }
+//  template < class , class ... > friend class Kokkos::Experimental::View ; //unnecessary now...
+  template < class , class ... > friend class Impl::ViewMapping ;
 
-  typedef ViewTraits< DataType , Properties ... > traits ;
+public: 
+  typedef ViewTraits< DataType , Properties ... > drvtraits ;
 
-  // Data type traits:
-  typedef typename traits::data_type            data_type;
-  typedef typename traits::const_data_type      const_data_type;
-  typedef typename traits::non_const_data_type  non_const_data_type;
+  typedef View< DataType******* , Properties...> view_type ; 
 
-  // Compatible array of trivial type traits:
-  typedef typename traits::scalar_array_type            scalar_array_type ;
-  typedef typename traits::const_scalar_array_type      const_scalar_array_type ;
-  typedef typename traits::non_const_scalar_array_type  non_const_scalar_array_type ;
+  typedef ViewTraits< DataType******* , Properties ... > traits ;
 
-  // Value type traits:
-  typedef typename traits::value_type            value_type ;
-  typedef typename traits::const_value_type      const_value_type ;
-  typedef typename traits::non_const_value_type  non_const_value_type ;
 
-  // Mapping traits:
-  typedef typename traits::array_layout   array_layout ;
-  typedef typename traits::specialize     specialize ;
+private:
+  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
 
-  // Execution space, memory space, memory access traits, and host mirror space:
-  typedef typename traits::execution_space    execution_space ;
-  typedef typename traits::memory_space       memory_space ;
-  typedef typename traits::device_type        device_type ;
-  typedef typename traits::memory_traits      memory_traits ;
-  typedef typename traits::host_mirror_space  host_mirror_space ;
+  track_type  m_track ;
+  map_type    m_map ;
+  unsigned m_rank;
 
-  typedef typename traits::size_type size_type ;
+public: 
+  KOKKOS_INLINE_FUNCTION
+  view_type & DownCast() const { return ( view_type & ) (*this); }
+  KOKKOS_INLINE_FUNCTION
+  const view_type & ConstDownCast() const { return (const view_type & ) (*this); }
 
-  using view_type::is_hostspace ;
-  using view_type::is_managed ;
-  using view_type::is_random_access ;
+  //Types below - at least the HostMirror requires the value_type, NOT the rank 7 data_type of the traits
 
   /** \brief  Compatible view of array of scalar types */
-  typedef DynRankView< typename traits::scalar_array_type ,
-                       typename traits::array_layout ,
-                       typename traits::device_type ,
-                       typename traits::memory_traits >
+  typedef DynRankView< typename drvtraits::scalar_array_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
     array_type ;
 
   /** \brief  Compatible view of const data type */
-  typedef DynRankView< typename traits::const_data_type ,
-                       typename traits::array_layout ,
-                       typename traits::device_type ,
-                       typename traits::memory_traits >
+  typedef DynRankView< typename drvtraits::const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
     const_type ;
 
   /** \brief  Compatible view of non-const data type */
-  typedef DynRankView< typename traits::non_const_data_type ,
-                       typename traits::array_layout ,
-                       typename traits::device_type ,
-                       typename traits::memory_traits >
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::device_type ,
+                typename drvtraits::memory_traits >
     non_const_type ;
 
   /** \brief  Compatible HostMirror view */
-  typedef DynRankView< typename traits::non_const_data_type ,
-                       typename traits::array_layout ,
-                       typename traits::host_mirror_space >
+  typedef DynRankView< typename drvtraits::non_const_data_type ,
+                typename drvtraits::array_layout ,
+                typename drvtraits::host_mirror_space >
     HostMirror ;
 
+
   //----------------------------------------
   // Domain rank and extents
 
+//  enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the enum?
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); }
+
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); }
+
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() *
+                                                                m_map.dimension_1() *
+                                                                m_map.dimension_2() *
+                                                                m_map.dimension_3() *
+                                                                m_map.dimension_4() *
+                                                                m_map.dimension_5() *
+                                                                m_map.dimension_6() *
+                                                                m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
+
+  //----------------------------------------
+  // Range span is the span which contains all members.
+
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contigous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+  // Allow specializations to query their specialized map
+
   KOKKOS_INLINE_FUNCTION
-  DynRankView() : view_type() , m_rank(0) {}
+  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; }
+
+  //----------------------------------------
+
+private:
+
+  enum {
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
+
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
+
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
+
+    is_default_map =
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
+
+// Bounds checking macros
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
+  Kokkos::Experimental::Impl::verify_dynrankview_rank ( N , *this ) ; \
+  Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ; 
+
+#else
+
+#define KOKKOS_VIEW_OPERATOR_VERIFY( N , ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
+
+#endif
+
+public:
 
   KOKKOS_INLINE_FUNCTION
   constexpr unsigned rank() const { return m_rank; }
 
-  using view_type::extent; 
-  using view_type::extent_int; 
-  using view_type::layout;
-  using view_type::dimension;
-  using view_type::size;
-  using view_type::stride;
-
-  using pointer_type = typename view_type::pointer_type;
-  using view_type::reference_type_is_lvalue_reference;
-  using view_type::span;
-  using view_type::capacity;
-  using view_type::span_is_contiguous;
-  using view_type::data;
-  using view_type::implementation_map;
-
-  using view_type::is_contiguous;
-  using view_type::ptr_on_device;
-
-  //Deprecated, remove soon (add for test)
-  using view_type::dimension_0;
-  using view_type::dimension_1;
-  using view_type::dimension_2;
-  using view_type::dimension_3;
-  using view_type::dimension_4;
-  using view_type::dimension_5;
-  using view_type::dimension_6;
-  using view_type::dimension_7;
-  using view_type::stride_0;
-  using view_type::stride_1;
-  using view_type::stride_2;
-  using view_type::stride_3;
-  using view_type::stride_4;
-  using view_type::stride_5;
-  using view_type::stride_6;
-  using view_type::stride_7;
 
   //operators ()
   // Rank 0
   KOKKOS_INLINE_FUNCTION
   reference_type operator()() const
-    { return view_type::operator()(0,0,0,0,0,0,0); }
-  
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 0 , ( implementation_map() ) )
+      return implementation_map().reference();
+      //return m_map.reference(0,0,0,0,0,0,0); 
+    }
+
   // Rank 1
   // This assumes a contiguous underlying memory (i.e. no padding, no striding...)
   template< typename iType >
   KOKKOS_INLINE_FUNCTION
-  typename std::enable_if< std::is_same<value_type, scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
+  typename std::enable_if< std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
   operator[](const iType & i0) const
     {
       return data()[i0];
@@ -333,59 +551,141 @@ public:
   // AND a Trilinos/Sacado scalar type )
   template< typename iType >
   KOKKOS_INLINE_FUNCTION
-  typename std::enable_if< !std::is_same<value_type, scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
+  typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
   operator[](const iType & i0) const
     {
-      auto map = implementation_map();
-
-      const size_t dim_scalar = map.dimension_scalar();
+//      auto map = implementation_map();
+      const size_t dim_scalar = m_map.dimension_scalar();
       const size_t bytes = this->span() / dim_scalar;
 
-      typedef Kokkos::View<DataType*, array_layout, device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | memory_traits::RandomAccess | memory_traits::Atomic> > tmp_view_type;
+      typedef Kokkos::View<DataType*, typename traits::array_layout, typename traits::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged | traits::memory_traits::RandomAccess | traits::memory_traits::Atomic> > tmp_view_type;
       tmp_view_type rankone_view(this->data(), bytes, dim_scalar);
       return rankone_view(i0);
     }
 
   template< typename iType >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType & i0 ) const 
-    { return view_type::operator()(i0,0,0,0,0,0,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  operator()(const iType & i0 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 1 , ( m_map , i0 ) )
+      return m_map.reference(i0); 
+    }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename traits::specialize , void>::value && std::is_integral<iType>::value), reference_type>::type
+  operator()(const iType & i0 ) const
+    {
+      return m_map.reference(i0,0,0,0,0,0,0);
+    }
 
   // Rank 2
   template< typename iType0 , typename iType1 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 ) const 
-    { return view_type::operator()(i0,i1,0,0,0,0,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
+      return m_map.reference(i0,i1); 
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 2 , ( m_map , i0 , i1 ) )
+      return m_map.reference(i0,i1,0,0,0,0,0); 
+    }
 
   // Rank 3
   template< typename iType0 , typename iType1 , typename iType2 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const 
-    { return view_type::operator()(i0,i1,i2,0,0,0,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
+      return m_map.reference(i0,i1,i2); 
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 3 , ( m_map , i0 , i1 , i2 ) )
+      return m_map.reference(i0,i1,i2,0,0,0,0); 
+    }
 
   // Rank 4
   template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const 
-    { return view_type::operator()(i0,i1,i2,i3,0,0,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
+      return m_map.reference(i0,i1,i2,i3); 
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 4 , ( m_map , i0 , i1 , i2 , i3 ) )
+      return m_map.reference(i0,i1,i2,i3,0,0,0); 
+    }
 
   // Rank 5
   template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const 
-    { return view_type::operator()(i0,i1,i2,i3,i4,0,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
+      return m_map.reference(i0,i1,i2,i3,i4); 
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 5 , ( m_map , i0 , i1 , i2 , i3 , i4 ) )
+      return m_map.reference(i0,i1,i2,i3,i4,0,0); 
+    }
 
   // Rank 6
   template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const 
-    { return view_type::operator()(i0,i1,i2,i3,i4,i5,0); }
+  typename std::enable_if< (std::is_same<typename traits::specialize , void>::value && std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5); 
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 >
+  KOKKOS_INLINE_FUNCTION
+  typename std::enable_if< !(std::is_same<typename drvtraits::specialize , void>::value && std::is_integral<iType0>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 6 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 ) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,0); 
+    }
 
   // Rank 7
   template< typename iType0 , typename iType1 , typename iType2 , typename iType3, typename iType4 , typename iType5 , typename iType6 >
   KOKKOS_INLINE_FUNCTION
-  reference_type operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const 
-    { return view_type::operator()(i0,i1,i2,i3,i4,i5,i6); }
+  typename std::enable_if< (std::is_integral<iType0>::value  && std::is_integral<iType1>::value && std::is_integral<iType2>::value && std::is_integral<iType3>::value && std::is_integral<iType4>::value && std::is_integral<iType5>::value && std::is_integral<iType6>::value), reference_type>::type
+  operator()(const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const 
+    { 
+      KOKKOS_VIEW_OPERATOR_VERIFY( 7 , ( m_map , i0 , i1 , i2 , i3 , i4 , i5 , i6 ) )
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6); 
+    }
+
+#undef KOKKOS_VIEW_OPERATOR_VERIFY
 
   //----------------------------------------
   // Standard constructor, destructor, and assignment operators... 
@@ -394,46 +694,89 @@ public:
   ~DynRankView() {}
 
   KOKKOS_INLINE_FUNCTION
-  DynRankView( const DynRankView & ) = default ;
+  DynRankView() : m_track(), m_map(), m_rank() {} //Default ctor
 
   KOKKOS_INLINE_FUNCTION
-  DynRankView( DynRankView && ) = default ;
+  DynRankView( const DynRankView & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {}
 
   KOKKOS_INLINE_FUNCTION
-  DynRankView & operator = ( const DynRankView & ) = default ;
+  DynRankView( DynRankView && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ), m_rank(rhs.m_rank) {}
 
   KOKKOS_INLINE_FUNCTION
-  DynRankView & operator = ( DynRankView && ) = default ;
+  DynRankView & operator = ( const DynRankView & rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; }
+
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( DynRankView && rhs ) { m_track = rhs.m_track; m_map = rhs.m_map; m_rank = rhs.m_rank; return *this; } 
 
   //----------------------------------------
   // Compatible view copy constructor and assignment
   // may assign unmanaged from managed.
-
   template< class RT , class ... RP >
   KOKKOS_INLINE_FUNCTION
   DynRankView( const DynRankView<RT,RP...> & rhs )
-    : view_type( rhs.ConstDownCast() )  
+    : m_track( rhs.m_track , traits::is_managed )
+    , m_map()
     , m_rank(rhs.m_rank)
-    {}
+    {
+      typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
 
   template< class RT , class ... RP >
   KOKKOS_INLINE_FUNCTION
   DynRankView & operator = (const DynRankView<RT,RP...> & rhs )
-  {
-    view_type::operator = ( rhs.ConstDownCast() );
-    m_rank = rhs.rank();
-    return *this;
-  }
+    {
+      typedef typename DynRankView<RT,RP...> ::traits SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.assign( rhs.m_track , traits::is_managed );
+      m_rank = rhs.rank();
+      return *this;
+    }
+
+// Experimental
+// Copy/Assign View to DynRankView
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView( const View<RT,RP...> & rhs )
+    : m_track()
+    , m_map()
+    , m_rank( rhs.Rank )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynRankView copy construction" );
+      Mapping::assign( *this , rhs );
+    }
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  DynRankView & operator = ( const View<RT,RP...> & rhs )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , Kokkos::Experimental::Impl::ViewToDynRankViewTag >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View to DynRankView copy assignment" );
+      Mapping::assign( *this , rhs );
+      return *this ;
+    }
 
   //----------------------------------------
   // Allocation tracking properties
 
-  using view_type::use_count;
-  using view_type::label;
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const
+    { return m_track.template get_label< typename traits::memory_space >(); }
 
   //----------------------------------------
   // Allocation according to allocation properties and array layout
-
+  // unused arg_layout dimensions must be set to ~size_t(0) so that rank deduction can properly take place
   template< class ... P >
   explicit inline
   DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
@@ -441,12 +784,77 @@ public:
                                , typename traits::array_layout
                                >::type const & arg_layout
       )
-      : view_type( arg_prop
-                 , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
+      : m_track()
+      , m_map()
       , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
-    {}
+    {
+      // Append layout and spaces if not input
+      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // use 'std::integral_constant<unsigned,I>' for non-types
+      // to avoid duplicate class error.
+      typedef Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      static_assert( traits::is_managed
+                   , "View allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing DynRankView and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties with possibly defaulted properties
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      // If allocating in CudaUVMSpace must fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the trait's executon space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from usng Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
 
-//Wrappers
+      Kokkos::Experimental::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+
+  // Wrappers
   template< class ... P >
   explicit KOKKOS_INLINE_FUNCTION
   DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
@@ -454,10 +862,16 @@ public:
                                , typename traits::array_layout
                                >::type const & arg_layout
       )
-      : view_type( arg_prop
-                 , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
+      : m_track() // No memory tracking
+      , m_map( arg_prop , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout) )
       , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank(arg_layout) )
-    {}
+    {
+      static_assert(
+        std::is_same< pointer_type
+                    , typename Impl::ViewCtorProp< P... >::pointer_type
+                    >::value ,
+        "Constructing DynRankView to wrap user memory must supply matching pointer type" );
+    }
 
   //----------------------------------------
   //Constructor(s)
@@ -468,14 +882,14 @@ public:
   DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
       , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
                                , size_t
-                               >::type const arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
+                               >::type const arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
       )
     : DynRankView( arg_prop
     , typename traits::array_layout
@@ -488,14 +902,14 @@ public:
   DynRankView( const Impl::ViewCtorProp< P ... > & arg_prop
       , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
                                , size_t
-                               >::type const arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
+                               >::type const arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
       )
     : DynRankView( arg_prop
     , typename traits::array_layout
@@ -514,20 +928,20 @@ public:
     : DynRankView( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )
     {}
 
-  // Allocate label and layout
+  // Allocate label and layout, must disambiguate from subview constructor
   template< typename Label >
   explicit inline
   DynRankView( const Label & arg_label
       , typename std::enable_if<
           Kokkos::Experimental::Impl::is_view_label<Label>::value ,
-        const size_t >::type arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
+        const size_t >::type arg_N0 = ~size_t(0) 
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
       )
     : DynRankView( Impl::ViewCtorProp< std::string >( arg_label )
     , typename traits::array_layout
@@ -536,44 +950,58 @@ public:
     {}
 
   // For backward compatibility
-/*
   explicit inline
   DynRankView( const ViewAllocateWithoutInitializing & arg_prop
       , const typename traits::array_layout & arg_layout
       )
-    : view_type( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
-          , arg_layout
-          )
-    //, m_rank(arg_N0 == 0 ? 0 : ( arg_N1 == 0 ? 1 : ( arg_N2 == 0 ? 2 : ( arg_N3 == 0 ? 3 : ( arg_N4 == 0 ? 4 : ( arg_N5 == 0 ? 5 : ( arg_N6 == 0 ? 6 : ( arg_N7 == 0 ? 7 : 8 ) ) ) ) ) ) ) ) //how to extract rank?
+    : DynRankView( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
+          , Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout)
+      )
     {}
-*/
 
   explicit inline
   DynRankView( const ViewAllocateWithoutInitializing & arg_prop
-      , const size_t arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
       )
     : DynRankView(Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 ) 
     {}
 
-  using view_type::memory_span;
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  static constexpr size_t required_allocation_size(
+                                       const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span(
+        typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+    }
 
   explicit KOKKOS_INLINE_FUNCTION
   DynRankView( pointer_type arg_ptr
-      , const size_t arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0)
       )
     : DynRankView( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
     {}
@@ -589,39 +1017,76 @@ public:
   //----------------------------------------
   // Shared scratch memory constructor
 
-  using view_type::shmem_size; 
+  static inline
+  size_t shmem_size( const size_t arg_N0 = ~size_t(0) ,
+                     const size_t arg_N1 = ~size_t(0) ,
+                     const size_t arg_N2 = ~size_t(0) ,
+                     const size_t arg_N3 = ~size_t(0) ,
+                     const size_t arg_N4 = ~size_t(0) ,
+                     const size_t arg_N5 = ~size_t(0) ,
+                     const size_t arg_N6 = ~size_t(0) ,
+                     const size_t arg_N7 = ~size_t(0) )
+  {
+    const size_t num_passed_args =
+      ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) +
+      ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) +
+      ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) );
+
+    if ( std::is_same<typename traits::specialize , void>::value && num_passed_args != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+    {}
+
+    return map_type::memory_span(
+           typename traits::array_layout
+            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+  }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const typename traits::array_layout & arg_layout )
+    : DynRankView( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem( map_type::memory_span( 
+                  Impl::DynRankDimTraits<typename traits::specialize>::createLayout( arg_layout ) // NOTE(review): createLayout promotes the runtime-rank layout to the full-rank layout map_type expects -- confirm
+                ) ) ) )
+         , arg_layout )
+     {}
 
   explicit KOKKOS_INLINE_FUNCTION
   DynRankView( const typename traits::execution_space::scratch_memory_space & arg_space
-      , const size_t arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0 )
-    : view_type( Impl::DynRankDimTraits<typename traits::specialize>::template createView<view_type>( arg_space
-                                                                                                    , arg_N0
-                                                                                                    , arg_N1
-                                                                                                    , arg_N2
-                                                                                                    , arg_N3
-                                                                                                    , arg_N4
-                                                                                                    , arg_N5
-                                                                                                    , arg_N6
-                                                                                                    , arg_N7 ) )
-    , m_rank( Impl::DynRankDimTraits<typename traits::specialize>::computeRank( arg_N0
-                                                                              , arg_N1
-                                                                              , arg_N2
-                                                                              , arg_N3
-                                                                              , arg_N4
-                                                                              , arg_N5
-                                                                              , arg_N6
-                                                                              , arg_N7 ) )
+      , const size_t arg_N0 = ~size_t(0)
+      , const size_t arg_N1 = ~size_t(0)
+      , const size_t arg_N2 = ~size_t(0)
+      , const size_t arg_N3 = ~size_t(0)
+      , const size_t arg_N4 = ~size_t(0)
+      , const size_t arg_N5 = ~size_t(0)
+      , const size_t arg_N6 = ~size_t(0)
+      , const size_t arg_N7 = ~size_t(0) )
+
+    : DynRankView( Impl::ViewCtorProp<pointer_type>(
+                   reinterpret_cast<pointer_type>(
+                     arg_space.get_shmem(
+                       map_type::memory_span(
+                       Impl::DynRankDimTraits<typename traits::specialize>::createLayout(
+                       typename traits::array_layout
+                       ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                       , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) ) 
+                    )
+                  , typename traits::array_layout
+                    ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                    , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+        )
     {}
 
 };
 
+
+  template < typename D , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank( const DynRankView<D , P...> & DRV ) { return DRV.rank(); } //needed for transition to common constexpr method in view and dynrankview to return rank
+
 //----------------------------------------------------------------------------
 // Subview mapping.
 // Deduce destination view type from source view traits and subview arguments
@@ -719,11 +1184,11 @@ public:
 
   template < typename T , class ... P >
   KOKKOS_INLINE_FUNCTION
-  static ret_type subview( const unsigned src_rank , Kokkos::Experimental::View< T******* , P...> const & src 
+  static ret_type subview( const unsigned src_rank , Kokkos::Experimental::DynRankView< T , P...> const & src 
                     , Args ... args )
     {
 
-      typedef ViewMapping< traits_type, void >  DstType ;
+       typedef ViewMapping< traits_type, void >  DstType ;
 
        typedef typename std::conditional< (rank==0) , ViewDimension<>
                                                     , typename std::conditional< (rank==1) , ViewDimension<0>
@@ -801,13 +1266,21 @@ subdynrankview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args.
   
     typedef Kokkos::Experimental::Impl::ViewMapping< Kokkos::Experimental::Impl::DynRankSubviewTag , Kokkos::Experimental::ViewTraits< D*******, P... > , Args... > metafcn ;
 
-    return metafcn::subview( src.rank() , src.ConstDownCast() , args... );
+    return metafcn::subview( src.rank() , src , args... );
+  }
+
+//Wrapper to allow subview function name
+template< class D , class ... P , class ...Args >
+KOKKOS_INLINE_FUNCTION
+Subdynrankview< ViewTraits<D******* , P...> , Args... > 
+subview( const Kokkos::Experimental::DynRankView< D , P... > &src , Args...args)
+  {
+    return subdynrankview( src , args... );
   }
 
 } // namespace Experimental
 } // namespace Kokkos
 
-
 namespace Kokkos {
 namespace Experimental {
 
@@ -854,6 +1327,109 @@ bool operator != ( const DynRankView<LT,LP...> & lhs ,
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class OutputView , typename Enable = void >
+struct DynRankViewFill {
+
+  typedef typename OutputView::traits::const_value_type  const_value_type ;
+
+  const OutputView output ;
+  const_value_type input ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t n1 = output.dimension_1();
+    const size_t n2 = output.dimension_2();
+    const size_t n3 = output.dimension_3();
+    const size_t n4 = output.dimension_4();
+    const size_t n5 = output.dimension_5();
+    const size_t n6 = output.dimension_6();
+
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+      output(i0,i1,i2,i3,i4,i5,i6) = input ;
+    }}}}}}
+  }
+
+  DynRankViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      typedef typename OutputView::execution_space  execution_space ;
+      typedef Kokkos::RangePolicy< execution_space > Policy ;
+
+      const Kokkos::Impl::ParallelFor< DynRankViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) );
+
+      closure.execute();
+
+      execution_space::fence();
+    }
+};
+
+template< class OutputView >
+struct DynRankViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > { 
+  DynRankViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
+    }
+};
+
+template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space >
+struct DynRankViewRemap {
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+
+  DynRankViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    {
+      typedef Kokkos::RangePolicy< ExecSpace > Policy ;
+      const Kokkos::Impl::ParallelFor< DynRankViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
+      closure.execute();
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+      output(i0,i1,i2,i3,i4,i5,i6) = input(i0,i1,i2,i3,i4,i5,i6);
+    }}}}}}
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
 
 namespace Kokkos {
 namespace Experimental {
@@ -863,9 +1439,17 @@ template< class DT , class ... DP >
 inline
 void deep_copy
   ( const DynRankView<DT,DP...> & dst
-  , typename ViewTraits<DT,DP...>::const_value_type & value )
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
 {
-  deep_copy( dst.ConstDownCast() , value );
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  Kokkos::Experimental::Impl::DynRankViewFill< DynRankView<DT,DP...> >( dst , value );
 }
 
 /** \brief  Deep copy into a value in Host memory from a view.  */
@@ -873,21 +1457,156 @@ template< class ST , class ... SP >
 inline
 void deep_copy
   ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
-  , const DynRankView<ST,SP...> & src )
+  , const DynRankView<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
 {
-  deep_copy( dst , src.ConstDownCast() );
-}
+  if ( src.rank() != 0 )
+  {
+    Kokkos::abort("");
+  }
 
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
+}
 
 //----------------------------------------------------------------------------
-/** \brief  A deep copy between views of compatible type */
-template< class DT , class ... DP , class ST , class ... SP >
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same rank, same contiguous layout.
+ */
+template< class DstType , class SrcType >
 inline
 void deep_copy
-  ( const DynRankView<DT,DP...> & dst
-  , const DynRankView<ST,SP...> & src )
+  ( const DstType & dst
+  , const SrcType & src
+  , typename std::enable_if<(
+    std::is_same< typename DstType::traits::specialize , void >::value &&
+    std::is_same< typename SrcType::traits::specialize , void >::value
+    &&
+    ( Kokkos::Experimental::is_dyn_rank_view<DstType>::value || Kokkos::Experimental::is_dyn_rank_view<SrcType>::value)
+  )>::type * = 0 )
 {
-  deep_copy( dst.ConstDownCast() , src.ConstDownCast() );
+  static_assert(
+    std::is_same< typename DstType::traits::value_type ,
+                  typename DstType::traits::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  typedef DstType  dst_type ;
+  typedef SrcType  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: if the source and destination views overlap, a parallel copy will be erroneous.
+    // ...
+
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+    if ( rank(src) == 0 && rank(dst) == 0 )
+    { 
+      typedef typename dst_type::value_type    value_type ;
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) ); 
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                       typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             ( std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutLeft>::value
+             ||
+               std::is_same< typename DstType::traits::array_layout ,
+                             typename Kokkos::LayoutRight>::value
+             )
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( std::is_same< typename DstType::traits::value_type ,
+                            typename SrcType::traits::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename DstType::traits::array_layout ,
+                           typename SrcType::traits::array_layout >::value
+             &&
+             std::is_same< typename DstType::traits::array_layout ,
+                          typename Kokkos::LayoutStride>::value
+           )
+           ||
+           (
+             rank(dst) == 1
+             &&
+             rank(src) == 1
+           )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() &&
+         dst.stride_0() == src.stride_0() &&
+         dst.stride_1() == src.stride_1() &&
+         dst.stride_2() == src.stride_2() &&
+         dst.stride_3() == src.stride_3() &&
+         dst.stride_4() == src.stride_4() &&
+         dst.stride_5() == src.stride_5() &&
+         dst.stride_6() == src.stride_6() &&
+         dst.stride_7() == src.stride_7()
+         ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::DynRankViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
 }
 
 } //end Experimental
@@ -900,6 +1619,48 @@ void deep_copy
 namespace Kokkos {
 namespace Experimental {
 
+namespace Impl {
+
+
+// Deduce Mirror Types
+template<class Space, class T, class ... P>
+struct MirrorDRViewType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy into the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space, return the existing view_type.
+  // This will also keep the unmanaged trait if necessary
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+
+template<class Space, class T, class ... P>
+struct MirrorDRVType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::DynRankView<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not even deep_copy into the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::DynRankView<data_type,array_layout,Space> view_type;
+};
+
+}
+
+
 template< class T , class ... P >
 inline
 typename DynRankView<T,P...>::HostMirror
@@ -914,14 +1675,7 @@ create_mirror( const DynRankView<T,P...> & src
   typedef typename src_type::HostMirror  dst_type ;
 
   return dst_type( std::string( src.label() ).append("_mirror")
-                 , src.dimension(0)
-                 , src.dimension(1)
-                 , src.dimension(2)
-                 , src.dimension(3)
-                 , src.dimension(4)
-                 , src.dimension(5)
-                 , src.dimension(6)
-                 , src.dimension(7) );
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
 }
 
 
@@ -938,27 +1692,15 @@ create_mirror( const DynRankView<T,P...> & src
   typedef DynRankView<T,P...>                   src_type ;
   typedef typename src_type::HostMirror  dst_type ;
 
-  Kokkos::LayoutStride layout ;
-
-  layout.dimension[0] = src.dimension(0);
-  layout.dimension[1] = src.dimension(1);
-  layout.dimension[2] = src.dimension(2);
-  layout.dimension[3] = src.dimension(3);
-  layout.dimension[4] = src.dimension(4);
-  layout.dimension[5] = src.dimension(5);
-  layout.dimension[6] = src.dimension(6);
-  layout.dimension[7] = src.dimension(7);
-
-  layout.stride[0] = src.stride(0);
-  layout.stride[1] = src.stride(1);
-  layout.stride[2] = src.stride(2);
-  layout.stride[3] = src.stride(3);
-  layout.stride[4] = src.stride(4);
-  layout.stride[5] = src.stride(5);
-  layout.stride[6] = src.stride(6);
-  layout.stride[7] = src.stride(7);
-
-  return dst_type( std::string( src.label() ).append("_mirror") , layout );
+  return dst_type( std::string( src.label() ).append("_mirror") 
+                 , Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
+
+// Create a mirror in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src) {
+  return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
 }
 
 template< class T , class ... P >
@@ -997,6 +1739,22 @@ create_mirror_view( const DynRankView<T,P...> & src
   return Kokkos::Experimental::create_mirror( src ); 
 }
 
+// Create a mirror view in a new space (specialization for same space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src
+  , typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::DynRankView<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
+}
+
 } //end Experimental
 } //end Kokkos
 
@@ -1006,27 +1764,26 @@ create_mirror_view( const DynRankView<T,P...> & src
 
 namespace Kokkos {
 namespace Experimental {
-
 /** \brief  Resize a view with copying old data to new data at the corresponding indices. */
 template< class T , class ... P >
 inline
 void resize( DynRankView<T,P...> & v ,
-             const size_t n0 = 0 ,
-             const size_t n1 = 0 ,
-             const size_t n2 = 0 ,
-             const size_t n3 = 0 ,
-             const size_t n4 = 0 ,
-             const size_t n5 = 0 ,
-             const size_t n6 = 0 ,
-             const size_t n7 = 0 )
+             const size_t n0 = ~size_t(0) ,
+             const size_t n1 = ~size_t(0) ,
+             const size_t n2 = ~size_t(0) ,
+             const size_t n3 = ~size_t(0) ,
+             const size_t n4 = ~size_t(0) ,
+             const size_t n5 = ~size_t(0) ,
+             const size_t n6 = ~size_t(0) ,
+             const size_t n7 = ~size_t(0) )
 {
   typedef DynRankView<T,P...>  drview_type ;
 
   static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
 
-  drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
+  drview_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6 );
 
-  Kokkos::Experimental::Impl::ViewRemap< drview_type , drview_type >( v_resized , v );
+  Kokkos::Experimental::Impl::DynRankViewRemap< drview_type , drview_type >( v_resized, v );
 
   v = v_resized ;
 }
@@ -1035,29 +1792,30 @@ void resize( DynRankView<T,P...> & v ,
 template< class T , class ... P >
 inline
 void realloc( DynRankView<T,P...> & v ,
-              const size_t n0 = 0 ,
-              const size_t n1 = 0 ,
-              const size_t n2 = 0 ,
-              const size_t n3 = 0 ,
-              const size_t n4 = 0 ,
-              const size_t n5 = 0 ,
-              const size_t n6 = 0 ,
-              const size_t n7 = 0 )
+              const size_t n0 = ~size_t(0) ,
+              const size_t n1 = ~size_t(0) ,
+              const size_t n2 = ~size_t(0) ,
+              const size_t n3 = ~size_t(0) ,
+              const size_t n4 = ~size_t(0) ,
+              const size_t n5 = ~size_t(0) ,
+              const size_t n6 = ~size_t(0) ,
+              const size_t n7 = ~size_t(0) )
 {
-  typedef DynRankView<T,P...>  view_type ;
+  typedef DynRankView<T,P...>  drview_type ;
 
   static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
 
   const std::string label = v.label();
 
-  v = view_type(); // Deallocate first, if the only view to allocation
-  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+  v = drview_type(); // Deallocate first, if the only view to allocation
+  v = drview_type( label, n0, n1, n2, n3, n4, n5, n6 );
 }
 
 } //end Experimental
 
 } //end Kokkos
 
+using Kokkos::Experimental::is_dyn_rank_view ;
 
 namespace Kokkos {
 
@@ -1068,6 +1826,7 @@ using Kokkos::Experimental::deep_copy ;
 using Kokkos::Experimental::create_mirror ;
 using Kokkos::Experimental::create_mirror_view ;
 using Kokkos::Experimental::subdynrankview ;
+using Kokkos::Experimental::subview ;
 using Kokkos::Experimental::resize ;
 using Kokkos::Experimental::realloc ;
 
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index b1f9e95ed0..fb364f0bf2 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -77,10 +77,7 @@ private:
 
 public:
 
-  typedef Kokkos::Experimental::MemoryPool
-    < typename traits::memory_space
-    , typename traits::execution_space
-    > memory_pool ;
+  typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;
 
 private:
 
@@ -338,7 +335,7 @@ public:
     void operator()( unsigned i ) const
       {
         if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
-          m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
+          m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
         }
         m_chunks[i] = 0 ;
       }
@@ -397,7 +394,7 @@ public:
     // The memory pool chunk is guaranteed to be a power of two
     , m_chunk_shift(
         Kokkos::Impl::integral_power_of_two(
-          m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
+          m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
     , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
     , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
     {
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
index 7de290e711..df2fbed5a6 100644
--- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,6 +45,7 @@
 #define KOKKOS_BITSET_IMPL_HPP
 
 #include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>
 #include <stdint.h>
 
 #include <cstdio>
@@ -52,122 +53,57 @@
 #include <iostream>
 #include <iomanip>
 
-namespace Kokkos { namespace Impl {
+namespace Kokkos {
+namespace Impl {
 
 KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_right(unsigned i, int r)
+unsigned rotate_right( unsigned i, int r )
 {
-  enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
-  return r ? ((i >> r) | (i << (size-r))) : i ;
+  enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
+  return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
 }
 
-KOKKOS_FORCEINLINE_FUNCTION
-int bit_scan_forward(unsigned i)
-{
-#if defined( __CUDA_ARCH__ )
-  return __ffs(i) - 1;
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_ffs(i) - 1;
-#elif defined( __INTEL_COMPILER )
-  return _bit_scan_forward(i);
-#else
-
-  unsigned t = 1u;
-  int r = 0;
-  while (i && (i & t == 0))
-  {
-    t = t << 1;
-    ++r;
-  }
-  return r;
-#endif
-}
-
-
-KOKKOS_FORCEINLINE_FUNCTION
-int bit_scan_reverse(unsigned i)
-{
-  enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
-#if defined( __CUDA_ARCH__ )
-  return shift - __clz(i);
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return shift - __builtin_clz(i);
-#elif defined( __INTEL_COMPILER )
-  return _bit_scan_reverse(i);
-#else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while (i && (i & t == 0))
-  {
-    t = t >> 1;
-    ++r;
-  }
-  return r;
-#endif
-}
-
-
-// count the bits set
-KOKKOS_FORCEINLINE_FUNCTION
-int popcount(unsigned i)
-{
-#if defined( __CUDA_ARCH__ )
-  return __popc(i);
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_popcount(i);
-#elif defined ( __INTEL_COMPILER )
-  return _popcnt32(i);
-#else
-  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
-  i = i - ((i >> 1) & ~0u/3u);                                         // temp
-  i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u);                      // temp
-  i = (i + (i >> 4)) & ~0u/255u*15u;                                   // temp
-  return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
-#endif
-}
-
-
-template <typename Bitset>
+template < typename Bitset >
 struct BitsetCount
 {
-  typedef Bitset bitset_type;
-  typedef typename bitset_type::execution_space::execution_space execution_space;
-  typedef typename bitset_type::size_type size_type;
-  typedef size_type value_type;
+  typedef Bitset                                                  bitset_type;
+  typedef typename bitset_type::execution_space::execution_space  execution_space;
+  typedef typename bitset_type::size_type                         size_type;
+  typedef size_type                                               value_type;
 
   bitset_type m_bitset;
 
-  BitsetCount( bitset_type const& bitset)
+  BitsetCount( bitset_type const& bitset )
     : m_bitset(bitset)
   {}
 
   size_type apply() const
   {
     size_type count = 0u;
-    parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
+    parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
     return count;
   }
 
   KOKKOS_INLINE_FUNCTION
-  static void init( value_type & count)
+  void init( value_type & count ) const
   {
     count = 0u;
   }
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & count, const volatile size_type & incr )
+  void join( volatile value_type & count, const volatile size_type & incr ) const
   {
     count += incr;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type i, value_type & count) const
+  void operator()( size_type i, value_type & count ) const
   {
-    count += popcount(m_bitset.m_blocks[i]);
+    count += bit_count( m_bitset.m_blocks[i] );
   }
 };
 
-}} //Kokkos::Impl
+} // namespace Impl
+} // namespace Kokkos
 
 #endif // KOKKOS_BITSET_IMPL_HPP
-
diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index 3c4aed7bec..e71ccc0091 100644
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -713,13 +713,20 @@ public:
   typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
 
   typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
-  typedef typename dView0::host_mirror_space host ;
+  typedef typename dView0::host_mirror_space host_drv_space ;
+
+  typedef Kokkos::Experimental::View< T , device >        View0 ;
+  typedef Kokkos::Experimental::View< T* , device >       View1 ;
+  typedef Kokkos::Experimental::View< T******* , device > View7 ;
+
+  typedef typename View0::host_mirror_space  host_view_space ;
 
   TestDynViewAPI()
   {
+    run_test_resize_realloc();
     run_test_mirror();
-    run_test();
     run_test_scalar();
+    run_test();
     run_test_const();
     run_test_subview();
     run_test_subview_strided();
@@ -735,19 +742,147 @@ public:
     TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
   }
 
+  static void run_test_resize_realloc()
+  {
+    dView0 drv0("drv0", 10, 20, 30);
+    ASSERT_EQ( drv0.rank(), 3);
+
+    Kokkos::Experimental::resize(drv0, 5, 10);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.dimension_0(), 5);
+    ASSERT_EQ( drv0.dimension_1(), 10);
+    ASSERT_EQ( drv0.dimension_2(), 1);
+
+    Kokkos::Experimental::realloc(drv0, 10, 20);
+    ASSERT_EQ( drv0.rank(), 2);
+    ASSERT_EQ( drv0.dimension_0(), 10);
+    ASSERT_EQ( drv0.dimension_1(), 20);
+    ASSERT_EQ( drv0.dimension_2(), 1);
+
+  }
+
   static void run_test_mirror()
   {
-    typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
+    typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
     typedef typename view_type::HostMirror mirror_type ;
     view_type a("a");
     mirror_type am = Kokkos::Experimental::create_mirror_view(a);
     mirror_type ax = Kokkos::Experimental::create_mirror(a);
     ASSERT_EQ( & a() , & am() );
+    ASSERT_EQ( a.rank() , am.rank() );
+    ASSERT_EQ( ax.rank() , am.rank() );
+
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+  
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
+      int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
+      int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
+  
+      ASSERT_EQ(equal_ptr_h_h2,0);
+      ASSERT_EQ(equal_ptr_h_d ,0);
+      ASSERT_EQ(equal_ptr_h2_d,0);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
+
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    } 
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    } 
+    if (Kokkos::HostSpace::execution_space::is_initialized() )
+    {
+      typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
+      unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
+      view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+      auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
+      auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
+  
+      int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
+      int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
+      int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
+  
+      int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0; 
+      ASSERT_EQ(equal_ptr_h_h2,1);
+      ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
+      ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+  
+      ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
+      ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
+
+      ASSERT_EQ(a_h.rank(),a_h2.rank());
+      ASSERT_EQ(a_h.rank(),a_d.rank());
+    }
   }
 
   static void run_test_scalar()
   {
-    typedef typename dView0::HostMirror  hView0 ;
+    typedef typename dView0::HostMirror  hView0 ; //HostMirror of DynRankView is a DynRankView
 
     dView0 dx , dy ;
     hView0 hx , hy ;
@@ -765,6 +900,79 @@ public:
     Kokkos::Experimental::deep_copy( hy , dy );
 
     ASSERT_EQ( hx(), hy() );
+    ASSERT_EQ( dx.rank() , hx.rank() );
+    ASSERT_EQ( dy.rank() , hy.rank() );
+
+  //View - DynRankView Interoperability tests
+  // deep_copy DynRankView to View
+    View0 vx("vx");
+    Kokkos::deep_copy( vx , dx );
+    ASSERT_EQ( rank(dx) , rank(vx) );
+
+    View0 vy("vy");
+    Kokkos::deep_copy( vy , dy );
+    ASSERT_EQ( rank(dy) , rank(vy) );
+
+  // deep_copy View to DynRankView 
+    dView0 dxx("dxx");
+    Kokkos::deep_copy( dxx , vx );
+    ASSERT_EQ( rank(dxx) , rank(vx) );
+
+
+    View7 vcast = dx.ConstDownCast();
+    ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
+    ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
+    ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
+    ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
+    ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
+
+    View7 vcast1( dy.ConstDownCast() );
+    ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
+    ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
+    ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
+    ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
+    ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
+
+  //View - DynRankView Interoperability tests
+  // copy View to DynRankView
+    dView0 dfromvx( vx );
+    auto hmx = Kokkos::create_mirror_view(dfromvx) ;
+    Kokkos::deep_copy(hmx , dfromvx);
+    auto hvx = Kokkos::create_mirror_view(vx) ;
+    Kokkos::deep_copy(hvx , vx);
+    ASSERT_EQ( rank(hvx) , rank(hmx) );
+    ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
+    ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
+
+  // copy-assign View to DynRankView
+    dView0 dfromvy = vy ;
+    auto hmy = Kokkos::create_mirror_view(dfromvy) ;
+    Kokkos::deep_copy(hmy , dfromvy);
+    auto hvy = Kokkos::create_mirror_view(vy) ;
+    Kokkos::deep_copy(hvy , vy);
+    ASSERT_EQ( rank(hvy) , rank(hmy) );
+    ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
+    ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
+
+
+    View7 vtest1("vtest1",2,2,2,2,2,2,2);
+    dView0 dfromv1( vtest1 );
+    ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
+    ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
+    ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
+    ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
+
+    dView0 dfromv2( vcast );
+    ASSERT_EQ( dfromv2.rank() , vcast.Rank );
+    ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
+    ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
+    ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
+
+    dView0 dfromv3 = vcast1;
+    ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
+    ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
+    ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
+    ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
   }
 
   static void run_test()
@@ -782,22 +990,32 @@ public:
       (void) thing;
     }
 
+    dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
+    ASSERT_TRUE( d_uninitialized.data() != nullptr );
+    ASSERT_EQ( d_uninitialized.rank() , 2 );
+    ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
+    ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
+    ASSERT_EQ( d_uninitialized.dimension_2() , 1  );
+
     dView0 dx , dy , dz ;
     hView0 hx , hy , hz ;
 
-    ASSERT_TRUE( dx.ptr_on_device() == 0 );
-    ASSERT_TRUE( dy.ptr_on_device() == 0 );
-    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+    ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
+    ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
+
+    ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dy.ptr_on_device() == 0 );  //Okay with UVM
+    ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
     ASSERT_TRUE( hx.ptr_on_device() == 0 );
     ASSERT_TRUE( hy.ptr_on_device() == 0 );
     ASSERT_TRUE( hz.ptr_on_device() == 0 );
-    ASSERT_EQ( dx.dimension_0() , 0u );
-    ASSERT_EQ( dy.dimension_0() , 0u );
-    ASSERT_EQ( dz.dimension_0() , 0u );
+    ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
     ASSERT_EQ( hx.dimension_0() , 0u );
     ASSERT_EQ( hy.dimension_0() , 0u );
     ASSERT_EQ( hz.dimension_0() , 0u );
-    ASSERT_EQ( dx.rank() , 0u );
+    ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
     ASSERT_EQ( hx.rank() , 0u );
 
     dx = dView0( "dx" , N1 , N2 , N3 );
@@ -806,11 +1024,11 @@ public:
     hx = hView0( "hx" , N1 , N2 , N3 );
     hy = hView0( "hy" , N1 , N2 , N3 );
 
-    ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
     ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
     ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dx.rank() , 3 );
+    ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
     ASSERT_EQ( hx.rank() , 3 );
 
     dx = dView0( "dx" , N0 , N1 , N2 , N3 );
@@ -823,19 +1041,23 @@ public:
     ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
     ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
     ASSERT_EQ( dx.rank() , 4 );
+    ASSERT_EQ( dy.rank() , 4 );
     ASSERT_EQ( hx.rank() , 4 );
+    ASSERT_EQ( hy.rank() , 4 );
 
     ASSERT_EQ( dx.use_count() , size_t(1) );
 
     dView0_unmanaged unmanaged_dx = dx;
     ASSERT_EQ( dx.use_count() , size_t(1) );
 
+
     dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
                                                               dx.dimension_0(),
                                                               dx.dimension_1(),
                                                               dx.dimension_2(),
                                                               dx.dimension_3());
 
+
     {
       // Destruction of this view should be harmless
       const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
@@ -888,6 +1110,19 @@ public:
     hx = Kokkos::Experimental::create_mirror( dx );
     hy = Kokkos::Experimental::create_mirror( dy );
 
+    ASSERT_EQ( hx.rank() , dx.rank() );
+    ASSERT_EQ( hy.rank() , dy.rank() );
+
+    ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
+
     // T v1 = hx() ;    // Generates compile error as intended
     // T v2 = hx(0,0) ; // Generates compile error as intended
     // hx(0,0) = v2 ;   // Generates compile error as intended
@@ -990,7 +1225,9 @@ public:
       for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
         { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
       }}}}
+//    ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
     }
+
     dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
     dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
 
@@ -1006,6 +1243,35 @@ public:
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
     ASSERT_TRUE( dz.ptr_on_device() == 0 );
+
+  //View - DynRankView Interoperability tests
+    // deep_copy from view to dynrankview
+    const int testdim = 4;
+    dView0 dxx("dxx",testdim);
+    View1  vxx("vxx",testdim);
+    auto hvxx = Kokkos::create_mirror_view(vxx); 
+    for (int i = 0; i < testdim; ++i)
+      { hvxx(i) = i; }
+    Kokkos::deep_copy(vxx,hvxx);
+    Kokkos::deep_copy(dxx,vxx);
+    auto hdxx = Kokkos::create_mirror_view(dxx);
+    Kokkos::deep_copy(hdxx,dxx);
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hdxx(i) ); }
+
+    ASSERT_EQ( rank(hdxx) , rank(hvxx) );
+    ASSERT_EQ( hdxx.dimension_0() , testdim );
+    ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
+
+    // deep_copy from dynrankview to view
+    View1 vdxx("vdxx",testdim);
+    auto hvdxx = Kokkos::create_mirror_view(vdxx);
+    Kokkos::deep_copy(hvdxx , hdxx);
+    ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
+    ASSERT_EQ( hvdxx.dimension_0() , testdim );
+    ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
+    for (int i = 0; i < testdim; ++i)
+      { ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
   }
 
   typedef T DataType ;
@@ -1059,35 +1325,66 @@ public:
   //  N0 = 1000,N1 = 3,N2 = 5,N3 = 7 
     unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
     sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+    ASSERT_EQ( d7.rank() , 7 );
 
-    sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
+    sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); 
+    ASSERT_EQ( ds0.rank() , 0 );
 
 //Basic test - ALL
-    sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
+    sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); 
+    ASSERT_EQ( dsALL.rank() , 7 );
 
-//  Send a single value for one rank
+//  Send a value to final rank returning rank 6 subview
     sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
+    ASSERT_EQ( dsm1.rank() , 6 );
 
-//  Send a std::pair as a rank
+//  Send a std::pair as argument to a rank
     sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
+    ASSERT_EQ( dssp.rank() , 7 );
 
-//  Send a kokkos::pair as a rank; take default layout as input
+//  Send a kokkos::pair as argument to a rank; take default layout as input
     dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
+    ASSERT_EQ( dd0.rank() , 7 );
     sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( dtkp.rank() , 7 );
 
 // Return rank 7 subview, taking a pair as one argument, layout stride input
     sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds7.rank() , 7 );
 
 // Default Layout DynRankView
     dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
+    ASSERT_EQ( dv6.rank() , 6 );
 
 // DynRankView with LayoutRight
     typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
     drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
+    ASSERT_EQ( dr5.rank() , 5 );
 
 // LayoutStride but arranged as LayoutRight
-    unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
-    sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
+  // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that 
+  //  rank deduction can properly take place
+    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+    Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
+    ls.dimension[5] = ~size_t(0);
+    ls.dimension[6] = ~size_t(0);
+    ls.dimension[7] = ~size_t(0);
+    sdView d5("d5", ls);
+    ASSERT_EQ( d5.rank() , 5 );
+
+//  LayoutStride arranged as LayoutRight - commented out as example that fails unit test
+//    unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
+//    sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
+//
+//  Fails the following unit test:
+//    ASSERT_EQ( d5.rank() , dr5.rank() );
+//
+//  Explanation: In construction of the Kokkos::LayoutStride above, since the 
+//   remaining dimensions are not specified, they will default to values of 0 
+//   rather than ~size_t(0). 
+//  When passed to the DynRankView constructor the default dimensions (of 0) 
+//   will be counted toward the dynamic rank and return an incorrect value 
+//   (i.e. rank 7 rather than 5).
 
 // Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should) 
     ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
@@ -1100,21 +1397,21 @@ public:
 
 // Rank 5 subview of rank 5 dynamic rank view, layout stride input
     sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+    ASSERT_EQ( ds5.rank() , 5 );
 
 // Pass in extra ALL arguments beyond the rank of the DynRank View.
 // This behavior is allowed - ignore the extra ALL arguments when
 //  the src.rank() < number of arguments, but be careful!
     sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
 
+    ASSERT_EQ( ds5.rank() , ds5plus.rank() );
     ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
     ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
     ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
-    ASSERT_EQ( ds5.rank() , ds5plus.rank() );
-    ASSERT_EQ( ds5.rank() , 5 );
 
 #if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
-    ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
     ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
+    ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 );  // passing argument to rank beyond the view's rank is allowed iff it is a 0. 
 #endif
 
 // Similar test to rank 5 above, but create rank 4 subview
@@ -1131,9 +1428,9 @@ public:
 
   static void run_test_subview_strided()
   {
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;
 
     drview_left  xl2( "xl2", 100 , 200 );
     drview_right xr2( "xr2", 100 , 200 );
@@ -1159,35 +1456,37 @@ public:
     drview_left  xl4( "xl4", 10 , 20 , 30 , 40 );
     drview_right xr4( "xr4", 10 , 20 , 30 , 40 );
 
-    drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-    drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    //Replace subdynrankview with subview - test
+    drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
 
     ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
     ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
     ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
     ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
+    ASSERT_EQ( yl4.rank() , 2);
+    ASSERT_EQ( yr4.rank() , 2);
 
     ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
     ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
-
   }
 
   static void run_test_vector()
   {
     static const unsigned Length = 1000 , Count = 8 ;
 
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ; 
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ; 
 
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;
 
     multivector_type mv = multivector_type( "mv" , Length , Count );
     multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
 
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ; 
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;
 
     svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
     svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
@@ -1251,7 +1550,6 @@ public:
     const_smultivector_type cmv( mv );
     typename smultivector_type::const_type cmvX( cmv );
     typename const_smultivector_type::const_type ccmvX( cmv );
-
   }
 };
 
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
index fd37f16f0a..7e3ca005f4 100644
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -61,8 +61,7 @@ struct TestDynamicView
   typedef typename Space::execution_space  execution_space ;
   typedef typename Space::memory_space     memory_space ;
 
-  typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
-    memory_pool_type ;
+  typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
 
   typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
 
@@ -129,11 +128,9 @@ struct TestDynamicView
     typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
     typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;
 
-    const unsigned int chunk_size = 1024 ;
-
 // printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
 
-    memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
+    memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );
 
 // printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);
 
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
index 961ad58ec5..27e3ba1c31 100644
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@@ -34,6 +34,7 @@
 #cmakedefine KOKKOS_HAVE_Winthread
 #cmakedefine KOKKOS_HAVE_OPENMP
 #cmakedefine KOKKOS_HAVE_HWLOC
+#cmakedefine KOKKOS_HAVE_DEBUG
 #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
 #cmakedefine KOKKOS_HAVE_CXX11
 #cmakedefine KOKKOS_HAVE_CUSPARSE
diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt
index 34aa81e92c..d93ca14d96 100644
--- a/lib/kokkos/core/perf_test/CMakeLists.txt
+++ b/lib/kokkos/core/perf_test/CMakeLists.txt
@@ -8,11 +8,22 @@ SET(SOURCES
   PerfTestCuda.cpp
   )
 
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test.  That's why we separate building the test
+# from running the test.
+
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
+  SOURCES ${SOURCES}
+  COMM serial mpi
+  TESTONLYLIBS kokkos_gtest
+  )
+
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   PerfTest
-  SOURCES ${SOURCES}
+  NAME PerfTestExec
   COMM serial mpi
   NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
   FAIL_REGULAR_EXPRESSION "  FAILED  "
-  TESTONLYLIBS kokkos_gtest
   )
diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
index 4a4bc13cd4..524beb8b90 100644
--- a/lib/kokkos/core/perf_test/PerfTestCuda.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
@@ -159,7 +159,7 @@ struct TextureFetch
 
     Kokkos::Cuda::fence();
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
     for (int j=0; j<10; ++j) {
       RandomReduce f(array,indexes);
       f.apply(reduce);
diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
index 03805dcdf5..516696b141 100644
--- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
@@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
 
     Kokkos::deep_copy( one , (Scalar) 1 );
 
-    Kokkos::Impl::Timer timer ;
+    Kokkos::Timer timer ;
 
     for ( size_type j = 0 ; j < count ; ++j ) {
       // Reduction   : tmp = dot( Q(:,j) , Q(:,j) );
diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
index d13d9a49e8..ed5371f29c 100644
--- a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
@@ -252,7 +252,7 @@ struct HexGrad
     execution_space::fence();
 
     for ( int i = 0 ; i < iter ; ++i ) {
-      Kokkos::Impl::Timer timer ;
+      Kokkos::Timer timer ;
       Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
       execution_space::fence();
       const double dt = timer.seconds();
diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp
index 882a5c615e..ab73f2505e 100644
--- a/lib/kokkos/core/perf_test/test_atomic.cpp
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
 
   Kokkos::Impl::Timer timer;
   T res = LoopVariant<T>(loop,test);
-  double time1 = timer.seconds();
+  double time = timer.seconds();
 
   timer.reset();
   T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
-  double time2 = timer.seconds();
+  double timeNonAtomic = timer.seconds();
 
   timer.reset();
   T resSerial = LoopVariantSerial<T>(loop,test);
-  double time3 = timer.seconds();
+  double timeSerial = timer.seconds();
 
-  time1*=1e6/loop;
-  time2*=1e6/loop;
-  time3*=1e6/loop;
+  time         *=1e6/loop;
+  timeNonAtomic*=1e6/loop;
+  timeSerial   *=1e6/loop;
   //textcolor_standard();
   bool passed = true;
   if(resSerial!=res) passed = false;
   //if(!passed) textcolor(RESET,BLACK,YELLOW);
-  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
+  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
+         type_name,test,passed?"PASSED":"FAILED",loop,
+         1.0*resSerial,1.0*res,1.0*resNonAtomic,
+         timeSerial,time,timeNonAtomic,(int)sizeof(T));
   //if(!passed) textcolor_standard();
   printf("\n");
 }
@@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
 int main(int argc, char* argv[])
 {
   int type = -1;
-  int loop = 1000000;
+  int loop = 100000;
   int test = -1;
 
   for(int i=0;i<argc;i++)
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index 9930cdf1ba..d1a560ee04 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
 
 #endif
 
+
+namespace Kokkos {
+namespace Impl {
+  struct CudaLockArraysStruct {
+    int* atomic;
+    int* scratch;
+    int* threadid;
+  };
+}
+}
 __device__ __constant__
 #ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
 extern
 #endif
-int* kokkos_impl_cuda_atomic_lock_array ;
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
 
 #define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
 #define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
 
+namespace Kokkos {
+namespace Impl {
+  void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
+}
+}
+
 namespace Kokkos {
 namespace Impl {
 __device__ inline
@@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset = offset >> 2;
   offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
+  return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
 }
 
 __device__ inline
@@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
   size_t offset = size_t(ptr);
   offset = offset >> 2;
   offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
+  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
 }
 
 }
@@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
       cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
 
       #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
       // Invoke the driver function on the device
@@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
       #endif
 
       #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
       cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 863488c3b0..a4f372d65d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -51,10 +51,10 @@
 /* only compile this file if CUDA is enabled for Kokkos */
 #ifdef KOKKOS_HAVE_CUDA
 
+#include <Kokkos_Core.hpp>
 #include <Kokkos_Cuda.hpp>
 #include <Kokkos_CudaSpace.hpp>
 
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <impl/Kokkos_Error.hpp>
 
@@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
 
 namespace Kokkos {
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace {
-
-void texture_object_attach_impl(  Impl::AllocationTracker const & tracker
-                                , unsigned type_size
-                                , ::cudaChannelFormatDesc const & desc
-                               )
-{
-  enum { TEXTURE_BOUND_1D = 2u << 27 };
-
-  if ( tracker.attribute() == NULL ) {
-    // check for correct allocator
-    const bool ok_alloc =  tracker.allocator()->support_texture_binding();
-
-    const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
-
-    if (ok_alloc && ok_count) {
-      Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
-      tracker.set_attribute( attr );
-    }
-    else {
-      std::ostringstream oss;
-      oss << "Error: Cannot attach texture object";
-      if (!ok_alloc) {
-        oss << ", incompatabile allocator " << tracker.allocator()->name();
-      }
-      if (!ok_count) {
-        oss << ", array " << tracker.label() << " too large";
-      }
-      oss << ".";
-      Kokkos::Impl::throw_runtime_exception( oss.str() );
-    }
-  }
-
-  if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
-    std::ostringstream oss;
-    oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
-    Kokkos::Impl::throw_runtime_exception( oss.str() );
-  }
-
-}
-
-} // unnamed namespace
-
-/*--------------------------------------------------------------------------*/
-
-Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
-{
-  return Impl::AllocationTracker( allocator(), size, label);
-}
-
-void CudaSpace::texture_object_attach(  Impl::AllocationTracker const & tracker
-                                      , unsigned type_size
-                                      , ::cudaChannelFormatDesc const & desc
-                                     )
-{
-  texture_object_attach_impl( tracker, type_size, desc );
-}
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 void CudaSpace::access_error()
 {
   const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
 
 /*--------------------------------------------------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
-{
-  return Impl::AllocationTracker( allocator(), size, label);
-}
-
-void CudaUVMSpace::texture_object_attach(  Impl::AllocationTracker const & tracker
-                                         , unsigned type_size
-                                         , ::cudaChannelFormatDesc const & desc
-                                        )
-{
-  texture_object_attach_impl( tracker, type_size, desc );
-}
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 bool CudaUVMSpace::available()
 {
 #if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
 
 /*--------------------------------------------------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
-{
-  return Impl::AllocationTracker( allocator(), size, label);
-}
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 } // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
@@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
 
 namespace Kokkos {
 namespace {
-  __global__ void init_lock_array_kernel() {
+  __global__ void init_lock_array_kernel_atomic() {
     unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
 
     if(i<CUDA_SPACE_ATOMIC_MASK+1)
-      kokkos_impl_cuda_atomic_lock_array[i] = 0;
+      kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
+  }
+
+  __global__ void init_lock_array_kernel_scratch_threadid(int N) {
+    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if(i<N) {
+      kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
+      kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
+    }
   }
 }
 
+
 namespace Impl {
-int* lock_array_cuda_space_ptr(bool deallocate) {
+int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
   static int* ptr = NULL;
   if(deallocate) {
     cudaFree(ptr);
@@ -845,15 +767,62 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
   return ptr;
 }
 
-void init_lock_array_cuda_space() {
-  int is_initialized = 0;
+int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
+  return ptr;
+}
+
+int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
+  return ptr;
+}
+
+void init_lock_arrays_cuda_space() {
+  static int is_initialized = 0;
   if(! is_initialized) {
-    int* lock_array_ptr = lock_array_cuda_space_ptr();
-    cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
-    init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
+    Kokkos::Impl::CudaLockArraysStruct locks;
+    locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+    locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+    locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+    cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
+    init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
+    init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
   }
 }
 
+void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
+  static void* ptr = NULL;
+  static size_t current_size = 0;
+  if(current_size == 0) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+  if(bytes > current_size) {
+    current_size = bytes;
+    ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
+  }
+  if((bytes < current_size) && (force_shrink)) {
+    current_size = bytes;
+    Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
+    ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
+  }
+  return ptr;
+}
+
 }
 }
 #endif // KOKKOS_HAVE_CUDA
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
index 5746176274..10999ee57b 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
@@ -50,7 +50,6 @@
 #ifdef KOKKOS_HAVE_CUDA
 
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
 
 namespace Kokkos {
 namespace Impl {
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
deleted file mode 100644
index 05c73121bc..0000000000
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Macros.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#ifdef KOKKOS_HAVE_CUDA
-
-#include <impl/Kokkos_Error.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
-#include <Cuda/Kokkos_Cuda_Error.hpp>
-
-#include <sstream>
-
-namespace Kokkos { namespace Impl {
-
-
-/*--------------------------------------------------------------------------*/
-
-TextureAttribute::TextureAttribute(  void * const alloc_ptr
-                                   , size_t alloc_size
-                                   , cudaChannelFormatDesc const & desc
-                                  )
-  : m_tex_obj(0)
-{
-  cuda_device_synchronize();
-
-  struct cudaResourceDesc resDesc ;
-  struct cudaTextureDesc  texDesc ;
-
-  memset( & resDesc , 0 , sizeof(resDesc) );
-  memset( & texDesc , 0 , sizeof(texDesc) );
-
-  resDesc.resType                = cudaResourceTypeLinear ;
-  resDesc.res.linear.desc        = desc ;
-  resDesc.res.linear.sizeInBytes = alloc_size ;
-  resDesc.res.linear.devPtr      = alloc_ptr ;
-
-  CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
-
-  cuda_device_synchronize();
-}
-
-
-TextureAttribute::~TextureAttribute()
-{
-  if (m_tex_obj) {
-    cudaDestroyTextureObject( m_tex_obj );
-  }
-}
-
-/*--------------------------------------------------------------------------*/
-
-void * CudaMallocAllocator::allocate( size_t size )
-{
-  void * ptr = NULL;
-
-  CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
-
-  return ptr;
-}
-
-void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
-{
-  try {
-    CUDA_SAFE_CALL( cudaFree( ptr ) );
-  } catch(...) {}
-}
-
-void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  void * ptr = old_ptr;
-  if (old_size != new_size) {
-    ptr = allocate( new_size );
-    size_t copy_size = old_size < new_size ? old_size : new_size;
-
-    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
-
-    deallocate( old_ptr, old_size );
-  }
-  return ptr;
-}
-
-/*--------------------------------------------------------------------------*/
-
-void * CudaUVMAllocator::allocate( size_t size )
-{
-#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
-  void * ptr = NULL;
-  CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
-  return ptr;
-#else
-  throw_runtime_exception( "CUDA VERSION does not support UVM" );
-  return NULL;
-#endif
-}
-
-void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
-{
-  try {
-    CUDA_SAFE_CALL( cudaFree( ptr ) );
-  } catch(...) {}
-}
-
-void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  void * ptr = old_ptr;
-  if (old_size != new_size) {
-    ptr = allocate( new_size );
-    size_t copy_size = old_size < new_size ? old_size : new_size;
-
-    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
-
-    deallocate( old_ptr, old_size );
-  }
-  return ptr;
-}
-
-/*--------------------------------------------------------------------------*/
-
-void * CudaHostAllocator::allocate( size_t size )
-{
-  void * ptr = NULL;
-  CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
-  return ptr;
-}
-
-void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
-{
-  try {
-    CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
-  } catch(...) {}
-}
-
-void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  void * ptr = old_ptr;
-  if (old_size != new_size) {
-    ptr = allocate( new_size );
-    size_t copy_size = old_size < new_size ? old_size : new_size;
-
-    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
-
-    deallocate( old_ptr, old_size );
-  }
-  return ptr;
-}
-
-/*--------------------------------------------------------------------------*/
-
-}} // namespace Kokkos::Impl
-
-#endif //KOKKOS_HAVE_CUDA
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp
deleted file mode 100644
index 80bc986ad7..0000000000
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
-#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-/* only compile this file if CUDA is enabled for Kokkos */
-#ifdef KOKKOS_HAVE_CUDA
-
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
-
-namespace Kokkos { namespace Impl {
-
-
-// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
-// to be an 'unsigned long long'.  This chould change with
-// future version of Cuda and this typedef would have to
-// change accordingly.
-
-#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
-
-typedef enable_if<
-  sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
-  ::cudaTextureObject_t >::type cuda_texture_object_type ;
-
-#else
-
-typedef const void * cuda_texture_object_type ;
-
-#endif
-
-
-struct TextureAttribute : public AllocatorAttributeBase
-{
-  cuda_texture_object_type m_tex_obj ;
-
-  TextureAttribute(  void * const alloc_ptr
-                   , size_t alloc_size
-                   , cudaChannelFormatDesc const & desc
-                  );
-
-  ~TextureAttribute();
-};
-
-/// class CudaUnmanagedAllocator
-/// does nothing when deallocate(ptr,size) is called
-struct CudaUnmanagedAllocator
-{
-  static const char * name()
-  {
-    return "Cuda Unmanaged Allocator";
-  }
-
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-
-  static bool support_texture_binding() { return true; }
-};
-
-/// class CudaUnmanagedAllocator
-/// does nothing when deallocate(ptr,size) is called
-struct CudaUnmanagedUVMAllocator
-{
-  static const char * name()
-  {
-    return "Cuda Unmanaged UVM Allocator";
-  }
-
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-
-  static bool support_texture_binding() { return true; }
-};
-
-/// class CudaUnmanagedHostAllocator
-/// does nothing when deallocate(ptr,size) is called
-class CudaUnmanagedHostAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Cuda Unmanaged Host Allocator";
-  }
-  // Unmanaged deallocate does nothing
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-};
-
-/// class CudaMallocAllocator
-class CudaMallocAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Cuda Malloc Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t);
-
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-
-  static bool support_texture_binding() { return true; }
-};
-
-/// class CudaUVMAllocator
-class CudaUVMAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Cuda UVM Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t);
-
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-
-  static bool support_texture_binding() { return true; }
-};
-
-/// class CudaHostAllocator
-class CudaHostAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Cuda Host Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t);
-
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-
-
-}} // namespace Kokkos::Impl
-
-#endif //KOKKOS_HAVE_CUDA
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
index 02c85d268c..2d8d07d077 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -51,8 +51,8 @@
 
 #include <Cuda/Kokkos_Cuda_Error.hpp>
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 
 /*--------------------------------------------------------------------------*/
 /* Standard 'C' libraries */
@@ -70,7 +70,7 @@ __device__ __constant__
 unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
 
 __device__ __constant__
-int* kokkos_impl_cuda_atomic_lock_array ;
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
 
 #endif
 
@@ -190,7 +190,7 @@ namespace {
 
 class CudaInternalDevices {
 public:
-  enum { MAXIMUM_DEVICE_COUNT = 8 };
+  enum { MAXIMUM_DEVICE_COUNT = 64 };
   struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
   int                    m_cudaDevCount ;
 
@@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
 
   CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
 
+  if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
+    Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
+  }
   for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
     CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
   }
@@ -226,14 +229,6 @@ private:
   CudaInternal( const CudaInternal & );
   CudaInternal & operator = ( const CudaInternal & );
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  AllocationTracker m_scratchFlagsTracker;
-  AllocationTracker m_scratchSpaceTracker;
-  AllocationTracker m_scratchUnifiedTracker;
-
-#endif
-
 
 public:
 
@@ -255,6 +250,8 @@ public:
   size_type * m_scratchUnified ;
   cudaStream_t * m_stream ;
 
+  static int was_initialized;
+  static int was_finalized;
 
   static CudaInternal & singleton();
 
@@ -293,6 +290,8 @@ public:
   size_type * scratch_unified( const size_type size );
 };
 
+int CudaInternal::was_initialized = 0;
+int CudaInternal::was_finalized = 0;
 //----------------------------------------------------------------------------
 
 
@@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
 
 void CudaInternal::initialize( int cuda_device_id , int stream_count )
 {
+  if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
+  was_initialized = 1;
+  if ( is_initialized() ) return;
+
   enum { WordSize = sizeof(size_type) };
 
   if ( ! HostSpace::execution_space::is_initialized() ) {
@@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
   cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
 
   // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_array_cuda_space();
+  Impl::init_lock_arrays_cuda_space();
 
   #ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-  int* lock_array_ptr = lock_array_cuda_space_ptr();
-  cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+  Kokkos::Impl::CudaLockArraysStruct locks;
+  locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+  locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+  locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+  cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
   #endif
 }
 
@@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
 
     m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-    m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
-
-    m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
-
-#else
-
     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
 
     Record * const r = Record::allocate( Kokkos::CudaSpace()
@@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
 
     m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
 
-#endif
-
-
     CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
   }
 
@@ -582,26 +577,15 @@ CudaInternal::scratch_space( const Cuda::size_type size )
 
     m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-    m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
-
-    m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
-
-#else
-
-    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
-
-    Record * const r = Record::allocate( Kokkos::CudaSpace()
-                                       , "InternalScratchSpace"
-                                       , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
-
-    Record::increment( r );
+     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
 
-    m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
+     Record * const r = Record::allocate( Kokkos::CudaSpace()
+                                        , "InternalScratchSpace"
+                                        , ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
 
-#endif
+     Record::increment( r );
 
+     m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
   }
 
   return m_scratchSpace ;
@@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
 
     m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-    m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
-
-    m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
-
-#else
-
     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
 
     Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
@@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
     Record::increment( r );
 
     m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
-
-#endif
-
   }
 
   return m_scratchUnified ;
@@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
 
 void CudaInternal::finalize()
 {
+  was_finalized = 1;
   if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
 
-    lock_array_cuda_space_ptr(true);
+    atomic_lock_array_cuda_space_ptr(false);
+    scratch_lock_array_cuda_space_ptr(false);
+    threadid_lock_array_cuda_space_ptr(false);
+
     if ( m_stream ) {
       for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
         cudaStreamDestroy( m_stream[i] );
@@ -655,14 +632,6 @@ void CudaInternal::finalize()
       ::free( m_stream );
     }
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-    m_scratchSpaceTracker.clear();
-    m_scratchFlagsTracker.clear();
-    m_scratchUnifiedTracker.clear();
-
-#else
-
     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
 
@@ -670,8 +639,6 @@ void CudaInternal::finalize()
     RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
     RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
 
-#endif
-
     m_cudaDev             = -1 ;
     m_multiProcCount      = 0 ;
     m_maxWarpCount        = 0 ;
@@ -730,7 +697,13 @@ int Cuda::is_initialized()
 { return Impl::CudaInternal::singleton().is_initialized(); }
 
 void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
-{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
+{
+  Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
 
 std::vector<unsigned>
 Cuda::detect_device_arch()
@@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
 }
 
 void Cuda::finalize()
-{ Impl::CudaInternal::singleton().finalize(); }
+{
+  Impl::CudaInternal::singleton().finalize();
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
+}
 
 Cuda::Cuda()
   : m_device( Impl::CudaInternal::singleton().m_cudaDev )
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
index 328857d997..8b10d47f88 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@@ -57,17 +57,20 @@ template<class DriverType, bool Large>
 struct CudaGetMaxBlockSize;
 
 template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
-int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
-  return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
+int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
 }
 
 
 template<class DriverType>
 struct CudaGetMaxBlockSize<DriverType,true> {
-  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int numBlocks;
     int blockSize=32;
-    int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &numBlocks,
         cuda_parallel_launch_constant_memory<DriverType>,
@@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
 
     while (blockSize<1024 && numBlocks>0) {
       blockSize*=2;
-      sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length);
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
 
       cudaOccupancyMaxActiveBlocksPerMultiprocessor(
           &numBlocks,
@@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
 
 template<class DriverType>
 struct CudaGetMaxBlockSize<DriverType,false> {
-  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int numBlocks;
 
     int blockSize=32;
-    int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+    int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                    FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &numBlocks,
         cuda_parallel_launch_local_memory<DriverType>,
@@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
 
     while (blockSize<1024 && numBlocks>0) {
       blockSize*=2;
-      sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
 
       cudaOccupancyMaxActiveBlocksPerMultiprocessor(
           &numBlocks,
@@ -123,13 +130,15 @@ template<class DriverType, bool Large>
 struct CudaGetOptBlockSize;
 
 template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
-int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
-  return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
+int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
+  return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
 }
 
 template<class DriverType>
 struct CudaGetOptBlockSize<DriverType,true> {
-  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int blockSize=16;
     int numBlocks;
     int sharedmem;
@@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
       blockSize*=2;
 
       //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
-      sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
       cudaOccupancyMaxActiveBlocksPerMultiprocessor(
               &numBlocks,
               cuda_parallel_launch_constant_memory<DriverType>,
@@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
 
 template<class DriverType>
 struct CudaGetOptBlockSize<DriverType,false> {
-  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
+  static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
+                            const size_t shmem_extra_block, const size_t shmem_extra_thread) {
     int blockSize=16;
     int numBlocks;
     int sharedmem;
@@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
 
     while(blockSize<1024) {
       blockSize*=2;
-      sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
+      sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
+                  FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize/vector_length );
 
       cudaOccupancyMaxActiveBlocksPerMultiprocessor(
               &numBlocks,
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index 99296dd273..7afa06fdf5 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -99,13 +99,13 @@ public:
 
   __device__ inline
   const execution_space::scratch_memory_space & team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
   __device__ inline
-  const execution_space::scratch_memory_space & team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+  const execution_space::scratch_memory_space & team_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,1,0) ; }
   __device__ inline
-  const execution_space::scratch_memory_space & thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+  const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
 
   __device__ inline int league_rank() const { return m_league_rank ; }
   __device__ inline int league_size() const { return m_league_size ; }
@@ -122,6 +122,7 @@ public:
     }
     team_barrier();
     value = sh_val;
+    team_barrier();
   }
 
 #ifdef KOKKOS_HAVE_CXX11
@@ -203,10 +204,12 @@ public:
   CudaTeamMember( void * shared
                 , const int shared_begin
                 , const int shared_size
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
                 , const int arg_league_rank
                 , const int arg_league_size )
     : m_team_reduce( shared )
-    , m_team_shared( ((char *)shared) + shared_begin , shared_size )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size,  scratch_level_1_ptr, scratch_level_1_size)
     , m_league_rank( arg_league_rank ) 
     , m_league_size( arg_league_size ) 
     {}
@@ -214,11 +217,11 @@ public:
 #else
 
   const execution_space::scratch_memory_space & team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
-  const execution_space::scratch_memory_space & team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
-  const execution_space::scratch_memory_space & thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+    { return m_team_shared.set_team_thread_mode(0, 1,0) ; }
+  const execution_space::scratch_memory_space & team_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+  const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+    { return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
 
   int league_rank() const {return 0;}
   int league_size() const {return 1;}
@@ -245,6 +248,8 @@ public:
   CudaTeamMember( void * shared
                 , const int shared_begin
                 , const int shared_end
+                , void*     scratch_level_1_ptr
+                , const int scratch_level_1_size
                 , const int arg_league_rank
                 , const int arg_league_size );
 
@@ -272,8 +277,8 @@ private:
   int m_league_size ;
   int m_team_size ;
   int m_vector_length ;
-  int m_team_scratch_size ;
-  int m_thread_scratch_size ;
+  int m_team_scratch_size[2] ;
+  int m_thread_scratch_size[2] ;
   int m_chunk_size;
 
 public:
@@ -285,8 +290,10 @@ public:
     m_league_size = p.m_league_size;
     m_team_size = p.m_team_size;
     m_vector_length = p.m_vector_length;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -332,14 +339,23 @@ public:
   inline int vector_length()   const { return m_vector_length ; }
   inline int team_size()   const { return m_team_size ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size; }
+  inline int scratch_size(int level, int team_size_ = -1) const {
+    if(team_size_<0) team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
+  }
+  inline size_t team_scratch_size(int level) const {
+    return m_team_scratch_size[level];
+  }
+  inline size_t thread_scratch_size(int level) const {
+    return m_thread_scratch_size[level];
+  }
 
   TeamPolicyInternal()
     : m_league_size( 0 )
     , m_team_size( 0 )
     , m_vector_length( 0 )
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
     , m_chunk_size ( 32 ) 
    {}
 
@@ -351,8 +367,8 @@ public:
     : m_league_size( league_size_ )
     , m_team_size( team_size_request )
     , m_vector_length( vector_length_request )
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
     , m_chunk_size ( 32 )
     {
       // Allow only power-of-two vector_length
@@ -378,8 +394,8 @@ public:
     : m_league_size( league_size_ )
     , m_team_size( -1 )
     , m_vector_length( vector_length_request )
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
     , m_chunk_size ( 32 )
     {
       // Allow only power-of-two vector_length
@@ -398,8 +414,8 @@ public:
     : m_league_size( league_size_ )
     , m_team_size( team_size_request )
     , m_vector_length ( vector_length_request )
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
     , m_chunk_size ( 32 )
     {
       // Allow only power-of-two vector_length
@@ -423,8 +439,8 @@ public:
     : m_league_size( league_size_ )
     , m_team_size( -1 )
     , m_vector_length ( vector_length_request )
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
     , m_chunk_size ( 32 )
     {
       // Allow only power-of-two vector_length
@@ -448,26 +464,23 @@ public:
 
   /** \brief set per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
 
   /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
   /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
@@ -580,6 +593,8 @@ private:
   const size_type   m_vector_size ;
   const size_type   m_shmem_begin ;
   const size_type   m_shmem_size ;
+  void*             m_scratch_ptr[2] ;
+  const int         m_scratch_size[2] ;
 
   template< class TagType >
   __device__ inline
@@ -605,6 +620,8 @@ public:
         typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
                                     , m_shmem_begin
                                     , m_shmem_size
+                                    , m_scratch_ptr[1]
+                                    , m_scratch_size[1]
                                     , league_rank
                                     , m_league_size ) );
     }
@@ -627,22 +644,24 @@ public:
     : m_functor( arg_functor )
     , m_league_size( arg_policy.league_size() )
     , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
-        Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length() )
+        Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
     , m_vector_size( arg_policy.vector_length() )
     , m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
+    , m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
+    , m_scratch_ptr{NULL,NULL}
+    , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
     {
       // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+      m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
 
       const int shmem_size_total = m_shmem_begin + m_shmem_size ;
-
       if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
         Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
       }
 
-      if ( m_team_size >
-           Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
-                 ( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length()) {
+      if ( int(m_team_size) >
+           int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
+                 ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
         Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
       }
     }
@@ -657,9 +676,10 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType, class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
                     , Kokkos::Cuda 
                     >
 {
@@ -671,8 +691,12 @@ private:
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
 
 public:
 
@@ -686,11 +710,20 @@ public:
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
   size_type *         m_scratch_space ;
   size_type *         m_scratch_flags ;
   size_type *         m_unified_space ;
 
+  // Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
+  enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
+  // Some crutch to do function overloading
+private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
+public:
   template< class TagType >
   __device__ inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
@@ -703,17 +736,20 @@ public:
   exec_range( const Member & i , reference_type update ) const
     { m_functor( TagType() , i , update ); }
 
-#if ! defined( KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION )
+  __device__ inline
+  void operator() () const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
 
   __device__ inline
-  void operator()(void) const
+  void run(const DummySHMEMReductionType& ) const
   {
     const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
-      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
 
     {
       reference_type value =
-        ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
 
       // Number of blocks is bounded so that the reduction can be limited to two passes.
       // Each thread block is given an approximately equal amount of work to perform.
@@ -729,8 +765,8 @@ public:
     }
 
     // Reduce with final value at blockDim.y - 1 location.
-    if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>(
-           m_functor , blockIdx.x , gridDim.x ,
+    if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
            kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
 
       // This is the final block with the final result at the final threads' location
@@ -739,7 +775,7 @@ public:
       size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
 
       if ( threadIdx.y == 0 ) {
-        Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , shared );
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
       }
 
       if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
@@ -748,20 +784,18 @@ public:
     }
   }
 
-#else /* defined( KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION ) */
-
   __device__ inline
-   void operator()(void) const
+   void run(const DummyShflReductionType&) const
    {
 
-     value_type value = 0;
-
+     value_type value;
+     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
      // Number of blocks is bounded so that the reduction can be limited to two passes.
      // Each thread block is given an approximately equal amount of work to perform.
      // Accumulate the values for this block.
      // The accumulation ordering does not match the final pass, but is arithmatically equivalent.
 
-     const Policy range( m_policy , blockIdx.x , gridDim.x );
+     const WorkRange range( m_policy , blockIdx.x , gridDim.x );
 
      for ( Member iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
            iwork < iwork_end ; iwork += blockDim.y ) {
@@ -769,20 +803,23 @@ public:
      }
 
      pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
      int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
-     max_active_thread = max_active_thread == 0?blockDim.y:max_active_thread;
-     if(Impl::cuda_inter_block_reduction<FunctorType,Impl::JoinAdd<value_type> >
-            (value,Impl::JoinAdd<value_type>(),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+
+     max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+     if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
+            (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
        const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
        if(id==0) {
-         Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , (void*) &value );
+         Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
          *result = value;
        }
      }
    }
 
-#endif
-
   // Determine block size constrained by shared memory:
   static inline
   unsigned local_block_size( const FunctorType & f )
@@ -799,20 +836,17 @@ public:
       if ( nwork ) {
         const int block_size = local_block_size( m_functor );
   
-        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * block_size /* block_size == max block_count */ );
+        m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
         m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
-        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( m_functor ) );
+        m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
   
         // REQUIRED ( 1 , N , 1 )
         const dim3 block( 1 , block_size , 1 );
         // Required grid.x <= block.y
         const dim3 grid( std::min( int(block.y) , int( ( nwork + block.y - 1 ) / block.y ) ) , 1 , 1 );
   
-#ifdef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION
-      const int shmem = 0;
-#else
-      const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
-#endif
+      const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
+
   
       CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
   
@@ -820,18 +854,18 @@ public:
   
       if ( m_result_ptr ) {
         if ( m_unified_space ) {
-          const int count = ValueTraits::value_count( m_functor );
+          const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer)  );
           for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
         }
         else {
-          const int size = ValueTraits::value_size( m_functor );
+          const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer)  );
           DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
         }
       }
     }
     else {
       if (m_result_ptr) {
-        ValueInit::init( m_functor , m_result_ptr ); 
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
       }
     }
   }
@@ -840,21 +874,37 @@ public:
   ParallelReduce( const FunctorType  & arg_functor 
                 , const Policy       & arg_policy 
                 , const HostViewType & arg_result
-                )
+                , typename std::enable_if<
+                   Kokkos::is_view< HostViewType >::value
+                ,void*>::type = NULL)
   : m_functor( arg_functor )
   , m_policy(  arg_policy )
+  , m_reducer( InvalidType() )
   , m_result_ptr( arg_result.ptr_on_device() )
   , m_scratch_space( 0 )
   , m_scratch_flags( 0 )
   , m_unified_space( 0 )
   { }
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  { }
 };
 
 //----------------------------------------------------------------------------
 
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::Cuda
                     >
 {
@@ -864,18 +914,29 @@ private:
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+
 
 public:
 
   typedef FunctorType      functor_type ;
   typedef Cuda::size_type  size_type ;
 
+  enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
+
 private:
+  typedef double DummyShflReductionType;
+  typedef int DummySHMEMReductionType;
+
 
   // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
   // shared memory utilization:
@@ -886,6 +947,7 @@ private:
   //
 
   const FunctorType   m_functor ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
   size_type *         m_scratch_space ;
   size_type *         m_scratch_flags ;
@@ -893,8 +955,11 @@ private:
   size_type           m_team_begin ;
   size_type           m_shmem_begin ;
   size_type           m_shmem_size ;
+  void*               m_scratch_ptr[2] ;
+  int                 m_scratch_size[2] ;
   const size_type     m_league_size ;
   const size_type     m_team_size ;
+  const size_type     m_vector_size ;
 
   template< class TagType >
   __device__ inline
@@ -911,13 +976,18 @@ private:
 public:
 
   __device__ inline
-  void operator()(void) const
+  void operator() () const {
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+  }
+
+  __device__ inline
+  void run(const DummySHMEMReductionType&) const
   {
     const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
-      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+      word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
 
     reference_type value =
-      ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
 
     // Iterate this block through the league
     for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
@@ -925,6 +995,8 @@ public:
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
                                         , m_shmem_size
+                                        , m_scratch_ptr[1]
+                                        , m_scratch_size[1]
                                         , league_rank
                                         , m_league_size )
         , value );
@@ -932,7 +1004,7 @@ public:
 
     // Reduce with final value at blockDim.y - 1 location.
     if ( cuda_single_inter_block_reduce_scan<false,FunctorType,WorkTag>(
-           m_functor , blockIdx.x , gridDim.x ,
+           ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
            kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
 
       // This is the final block with the final result at the final threads' location
@@ -941,7 +1013,7 @@ public:
       size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
 
       if ( threadIdx.y == 0 ) {
-        Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , shared );
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
       }
 
       if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
@@ -950,18 +1022,51 @@ public:
     }
   }
 
+  __device__ inline
+  void run(const DummyShflReductionType&) const
+  {
+    value_type value;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
+
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+      this-> template exec_team< WorkTag >
+        ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , m_scratch_ptr[1]
+                                        , m_scratch_size[1]
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+
+    value_type init;
+    ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
+    if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
+           (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
+      const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+      if(id==0) {
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
+        *result = value;
+      }
+    }
+  }
+
   inline
   void execute()
     {
-      const int block_count = std::min( m_league_size , m_team_size );
+      const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
+                                               :std::min( m_league_size , m_team_size );
 
-      m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( m_functor ) * block_count );
+      m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
       m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
-      m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( m_functor ) );
+      m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
 
-      // REQUIRED DIMENSIONS ( 1 , N , 1 )
-      const dim3 block( 1 , m_team_size , 1 );
-      const dim3 grid( std::min( int(m_league_size) , int(m_team_size) ) , 1 , 1 );
+      const dim3 block( m_vector_size , m_team_size , 1 );
+      const dim3 grid( block_count , 1 , 1 );
       const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
 
       CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
@@ -970,11 +1075,11 @@ public:
 
       if ( m_result_ptr ) {
         if ( m_unified_space ) {
-          const int count = ValueTraits::value_count( m_functor );
+          const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
           for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
         }
         else {
-          const int size = ValueTraits::value_size( m_functor );
+          const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
           DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
         }
       }
@@ -984,8 +1089,11 @@ public:
   ParallelReduce( const FunctorType  & arg_functor 
                 , const Policy       & arg_policy 
                 , const HostViewType & arg_result
-                )
+                , typename std::enable_if<
+                                   Kokkos::is_view< HostViewType >::value
+                                ,void*>::type = NULL)
   : m_functor( arg_functor )
+  , m_reducer( InvalidType() )
   , m_result_ptr( arg_result.ptr_on_device() )
   , m_scratch_space( 0 )
   , m_scratch_flags( 0 )
@@ -993,39 +1101,107 @@ public:
   , m_team_begin( 0 )
   , m_shmem_begin( 0 )
   , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
   , m_league_size( arg_policy.league_size() )
   , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
-      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length() )
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+      arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  , m_scratch_size{arg_policy.scratch_size(0,m_team_size),arg_policy.scratch_size(1,m_team_size)}
   {
     // Return Init value if the number of worksets is zero
     if( arg_policy.league_size() == 0) {
-      ValueInit::init( m_functor , arg_result.ptr_on_device() );
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , arg_result.ptr_on_device() );
       return ;
     }
 
-    m_team_begin = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
+    m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
     m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
-    m_shmem_size = arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+    m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+    m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
 
     // The global parallel_reduce does not support vector_length other than 1 at the moment
-    if( arg_policy.vector_length() > 1)
-      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA.");
+    if( (arg_policy.vector_length() > 1) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
 
-    if( m_team_size < 32)
-      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA.");
+    if( (m_team_size < 32) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
 
     // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
 
     const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
 
-    if ( ! Kokkos::Impl::is_integral_power_of_two( m_team_size ) ||
-         CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+    if (! Kokkos::Impl::is_integral_power_of_two( m_team_size )  && !UseShflReduction ) {
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
     }
 
+    if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
+    }
+
     if ( m_team_size >
          Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
-               ( arg_functor , arg_policy.vector_length(), arg_policy.scratch_size() ) / arg_policy.vector_length()) {
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
+    }
+
+  }
+
+  ParallelReduce( const FunctorType  & arg_functor
+                , const Policy       & arg_policy
+                , const ReducerType & reducer)
+  : m_functor( arg_functor )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().ptr_on_device() )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( 0 )
+  , m_shmem_begin( 0 )
+  , m_shmem_size( 0 )
+  , m_scratch_ptr{NULL,NULL}
+  , m_league_size( arg_policy.league_size() )
+  , m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
+      Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
+                                                               arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
+      arg_policy.vector_length() )
+  , m_vector_size( arg_policy.vector_length() )
+  {
+    // Return Init value if the number of worksets is zero
+    if( arg_policy.league_size() == 0) {
+      ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
+      return ;
+    }
+
+    m_team_begin = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( arg_functor , m_team_size );
+    m_shmem_begin = sizeof(double) * ( m_team_size + 2 );
+    m_shmem_size = arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , m_team_size );
+    m_scratch_ptr[1] = cuda_resize_scratch_space(m_scratch_size[1]*(Cuda::concurrency()/(m_team_size*m_vector_size)));
+    m_scratch_size[0] = m_shmem_size;
+    m_scratch_size[1] = arg_policy.scratch_size(1,m_team_size);
+
+    // The global parallel_reduce does not support vector_length other than 1 at the moment
+    if( (arg_policy.vector_length() > 1) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA for dynamic sized reduction types.");
+
+    if( (m_team_size < 32) && !UseShflReduction )
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller than 32 is not currently supported with CUDA for dynamic sized reduction types.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+
+    if ( (! Kokkos::Impl::is_integral_power_of_two( m_team_size )  && !UseShflReduction ) ||
+         CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    if ( int(m_team_size) >
+         int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
     }
 
@@ -1453,14 +1629,12 @@ KOKKOS_INLINE_FUNCTION
 void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
       loop_boundaries, const Lambda & lambda, ValueType& result) {
 #ifdef __CUDA_ARCH__
-  ValueType val = ValueType();
+  result = ValueType();
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,val);
+    lambda(i,result);
   }
 
-  result = val;
-
   if (loop_boundaries.increment > 1)
     result += shfl_down(result, 1,loop_boundaries.increment);
   if (loop_boundaries.increment > 2)
@@ -1659,6 +1833,11 @@ namespace Impl {
       //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
       f(i,val);
     }
+    __device__ inline
+    void operator() (typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,val);
+    }
 
   };
 
@@ -1692,11 +1871,22 @@ namespace Impl {
     enum {value = true};
   };
 
+  template< class FunctorType, class Enable = void>
+    struct ReduceFunctorHasShmemSize {
+      enum {value = false};
+    };
+
+    template< class FunctorType>
+    struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > {
+      enum {value = true};
+    };
+
   template< class FunctorType, bool Enable =
       ( FunctorDeclaresValueType<FunctorType,void>::value) ||
       ( ReduceFunctorHasInit<FunctorType>::value  ) ||
       ( ReduceFunctorHasJoin<FunctorType>::value  ) ||
-      ( ReduceFunctorHasFinal<FunctorType>::value )
+      ( ReduceFunctorHasFinal<FunctorType>::value ) ||
+      ( ReduceFunctorHasShmemSize<FunctorType>::value )
       >
   struct IsNonTrivialReduceFunctor {
     enum {value = false};
@@ -1717,376 +1907,18 @@ namespace Impl {
     typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
   };
 
-}
-
-// general policy and view ouput
-template< class ExecPolicy , class FunctorTypeIn , class ViewType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorTypeIn & functor_in
-                    , const ViewType    & result_view
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if<
-                      ( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value &&
-                        Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
-                      )>::type * = 0 )
-{
-  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
-  typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
-  FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-}
-
-// general policy and pod or array of pod output
-template< class ExecPolicy , class FunctorTypeIn , class ResultType>
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorTypeIn & functor_in
-                    , ResultType& result_ref
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if<
-                      ( ! Kokkos::is_view<ResultType>::value &&
-                        ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
-                        ! Impl::is_integral< ExecPolicy >::value  &&
-                          Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
-{
-  typedef typename Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result_ref )
-               , 1
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( FunctorType(functor_in) , policy , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-}
-
-// general policy and pod or array of pod output
-template< class ExecPolicy , class FunctorType>
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type result_ref
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if<
-                      (   Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
-                        ! Impl::is_integral< ExecPolicy >::value  &&
-                          Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
-{
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result_ref )
-               , ValueTraits::value_count( functor )
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-}
-
-// integral range policy and view ouput
-template< class FunctorTypeIn , class ViewType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorTypeIn & functor_in
-                    , const ViewType    & result_view
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if<( Kokkos::is_view<ViewType>::value &&
-                                                 Impl::is_same<
-                          typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
-                          Kokkos::Cuda>::value
-                        )>::type * = 0 )
-{
-  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
-      execution_space ;
-
-  typedef RangePolicy< execution_space > ExecPolicy ;
-
-  typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
-
-  FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-
-}
-
-// integral range policy and pod or array of pod output
-template< class FunctorTypeIn , class ResultType>
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorTypeIn & functor_in
-                    , ResultType& result
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if< ! Kokkos::is_view<ResultType>::value &&
-                                                ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
-                                                Impl::is_same<
-                             typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
-                             Kokkos::Cuda>::value >::type * = 0 )
-{
-  typedef typename
-    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
-      execution_space ;
-  typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
-
-  typedef Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
-
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
-
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result )
-               , 1
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType , ExecPolicy > closure( FunctorType(functor_in) , ExecPolicy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-}
-
-template< class FunctorType>
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    , typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result
-                    , const std::string& str = "" 
-                    , typename Impl::enable_if< Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
-                                                Impl::is_same<
-                             typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
-                             Kokkos::Cuda>::value >::type * = 0 )
-{
-
-  typedef typename
-    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      execution_space ;
-  typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
-
-
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
-
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result )
-               , ValueTraits::value_count( functor )
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-    uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
-     }
-#endif
-}
-
-#ifdef KOKKOS_HAVE_CUDA
-template< class ExecPolicy , class FunctorType , class ResultType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , ResultType * result)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
-
-  parallel_reduce(policy,functor,result,str);
-
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
-}
-
-template< class ExecPolicy , class FunctorType , class ResultType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , ResultType & result)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
+  template< class FunctorTypeIn, class ExecPolicy, class ValueType>
+  struct ParallelReduceFunctorType<FunctorTypeIn,ExecPolicy,ValueType,Cuda> {
 
-  parallel_reduce(policy,functor,result,str);
+    enum {FunctorHasValueType = IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+    typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ValueType> >::type functor_type;
+    static functor_type functor(const FunctorTypeIn& functor_in) {
+      return Impl::if_c<FunctorHasValueType,FunctorTypeIn,functor_type>::select(functor_in,functor_type(functor_in));
+    }
+  };
 
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
 }
 
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
-
-  parallel_reduce(policy,functor,str);
-
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
-}
-#endif
 } // namespace Kokkos
 #endif /* defined( __CUDACC__ ) */
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index 11871a6abc..1778f631c0 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
   cuda_inter_warp_reduction(value,join,max_active_thread);
 }
 
-template< class FunctorType , class JoinOp>
+template< class FunctorType , class JoinOp , class ArgTag = void >
 __device__
-bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type  value,
+bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  value,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::reference_type  neutral,
                                  const JoinOp& join,
                                  Cuda::size_type * const m_scratch_space,
-                                 typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
+                                 typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
                                  Cuda::size_type * const m_scratch_flags,
                                  const int max_active_thread = blockDim.y) {
-  typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
-  typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
+  typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
 
   //Do the intra-block reduction with shfl operations and static shared memory
   cuda_intra_block_reduction(value,join,max_active_thread);
@@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
       if(id == 0)
         *m_scratch_flags = 0;
       last_block = true;
-      value = 0;
+      value = neutral;
 
       pointer_type const volatile global = (pointer_type) m_scratch_space ;
 
@@ -366,7 +367,12 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
     size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
     size_type * const global = global_data + word_count.value * block_id ;
 
+#if (__CUDA_ARCH__ < 500)
     for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
+#else
+    for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
+#endif
+
   }
 
   // Contributing blocks note that their contribution has been completed via an atomic-increment flag
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
new file mode 100644
index 0000000000..701d267e1b
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -0,0 +1,179 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Cuda > ;
+
+//----------------------------------------------------------------------------
+
+__device__
+void TaskQueueSpecialization< Kokkos::Cuda >::driver
+  ( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
+{
+  using Member = TaskExec< Kokkos::Cuda > ;
+  using Queue  = TaskQueue< Kokkos::Cuda > ;
+  using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  Member single_exec( 1 );
+  Member team_exec( blockDim.y );
+
+  const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
+
+  union {
+    task_root_type * ptr ;
+    int              raw[2] ;
+  } task ;
+
+  // Loop until all queues are empty and no tasks in flight
+
+  do {
+
+    // Each team lead attempts to acquire either a thread team task
+    // or collection of single thread tasks for the team.
+
+    if ( 0 == warp_lane ) {
+
+      task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
+          task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+#if 0
+printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
+      , uintptr_t(task.ptr));
+#endif
+
+    }
+
+    // shuffle broadcast
+
+    task.raw[0] = __shfl( task.raw[0] , 0 );
+    task.raw[1] = __shfl( task.raw[1] , 0 );
+
+    if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
+
+    if ( end != task.ptr ) {
+      if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
+        // Thread Team Task
+        (*task.ptr->m_apply)( task.ptr , & team_exec );
+      }
+      else if ( 0 == threadIdx.y ) {
+        // Single Thread Task
+        (*task.ptr->m_apply)( task.ptr , & single_exec );
+      }
+
+      if ( 0 == warp_lane ) {
+        queue->complete( task.ptr ); 
+      }
+    }
+  } while(1);
+}
+
+namespace {
+
+__global__
+void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
+{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
+
+}
+
+void TaskQueueSpecialization< Kokkos::Cuda >::execute
+  ( TaskQueue< Kokkos::Cuda > * const queue )
+{
+  const int warps_per_block = 4 ;
+  const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
+  const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
+  const int shared = 0 ;
+  const cudaStream_t stream = 0 ;
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+#if 0
+printf("cuda_task_queue_execute before\n");
+#endif
+
+  // Query the stack size, in bytes:
+  //
+  // size_t stack_size = 0 ;
+  // CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
+  //
+  // If not large enough then set the stack size, in bytes:
+  //
+  // CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
+ 
+  cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
+
+  CUDA_SAFE_CALL( cudaGetLastError() );
+
+  CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+#if 0
+printf("cuda_task_queue_execute after\n");
+#endif
+
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
new file mode 100644
index 0000000000..9d9347cc8d
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -0,0 +1,519 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
+#define KOKKOS_IMPL_CUDA_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+template< typename TaskType >
+__global__
+void set_cuda_task_base_apply_function_pointer
+  ( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
+{ *ptr = TaskType::apply ; }
+
+}
+
+template<>
+class TaskQueueSpecialization< Kokkos::Cuda >
+{
+public:
+
+  using execution_space = Kokkos::Cuda ;
+  using memory_space    = Kokkos::CudaUVMSpace ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const ) {}
+
+  __device__
+  static void driver( queue_type * const );
+
+  static
+  void execute( queue_type * const );
+
+  template< typename FunctorType >
+  static
+  void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
+    {
+      using TaskType = TaskBase< execution_space
+                               , typename FunctorType::value_type
+                               , FunctorType > ;
+
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
+
+      CUDA_SAFE_CALL( cudaGetLastError() );
+      CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+    }
+};
+
+extern template class TaskQueue< Kokkos::Cuda > ;
+
+//----------------------------------------------------------------------------
+/**\brief  Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
+ *         passed to tasks running in a Cuda space.
+ *
+ *  Cuda thread blocks for tasking are dimensioned:
+ *    blockDim.x == vector length
+ *    blockDim.y == team size
+ *    blockDim.z == number of teams
+ *  where
+ *    blockDim.x * blockDim.y == WarpSize
+ *
+ *  Both single thread and thread team tasks are run by a full Cuda warp.
+ *  A single thread task is called by warp lane #0 and the remaining
+ *  lanes of the warp are idle.
+ */
+template<>
+class TaskExec< Kokkos::Cuda >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
+
+  const int m_team_size ;
+
+  __device__
+  TaskExec( int arg_team_size = blockDim.y )
+    : m_team_size( arg_team_size ) {}
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+  __device__ void team_barrier() { /* __threadfence_block(); */ }
+  __device__ int  team_rank() const { return threadIdx.y ; }
+  __device__ int  team_size() const { return m_team_size ; }
+#else
+  __host__ void team_barrier() {}
+  __host__ int  team_rank() const { return 0 ; }
+  __host__ int  team_size() const { return 0 ; }
+#endif
+
+};
+
+//----------------------------------------------------------------------------
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  const iType increment ;
+  const TaskExec< Kokkos::Cuda > & thread;
+
+#if defined( __CUDA_ARCH__ )
+
+  __device__ inline
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
+    : start( threadIdx.y )
+    , end(arg_count)
+    , increment( blockDim.y )
+    , thread(arg_thread)
+    {}
+
+  __device__ inline
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread
+    , const iType & arg_start
+    , const iType & arg_end
+    )
+    : start( arg_start + threadIdx.y )
+    , end(   arg_end)
+    , increment( blockDim.y )
+    , thread( arg_thread )
+    {}
+
+#else
+
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
+
+  TeamThreadRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread
+    , const iType & arg_start
+    , const iType & arg_end
+    );
+
+#endif
+
+};
+
+//----------------------------------------------------------------------------
+
+template<typename iType>
+struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  const iType increment ;
+  const TaskExec< Kokkos::Cuda > & thread;
+
+#if defined( __CUDA_ARCH__ )
+
+  __device__ inline
+  ThreadVectorRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
+    : start( threadIdx.x )
+    , end(arg_count)
+    , increment( blockDim.x )
+    , thread(arg_thread)
+    {}
+
+#else
+
+  ThreadVectorRangeBoundariesStruct
+    ( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
+
+#endif
+
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+               , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
+ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
+               , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+// reduce across corresponding lanes between team members within warp
+// assume stride*team_size == warp_size
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void strided_shfl_warp_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int team_size,
+   int stride)
+{
+  for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
+  }
+}
+
+// multiple within-warp non-strided reductions
+template< typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void multi_shfl_warp_reduction
+  (const JoinType& join,
+   ValueType& val,
+   int vec_length)
+{
+  for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
+    join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
+  }
+}
+
+// broadcast within warp
+template< class ValueType >
+KOKKOS_INLINE_FUNCTION
+ValueType shfl_warp_broadcast
+  (ValueType& val,
+   int src_lane,
+   int width)
+{
+  return Kokkos::shfl(val, src_lane, width);
+}
+
+// all-reduce across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  strided_shfl_warp_reduction<ValueType, JoinType>(
+                          join,
+                          initialized_result,
+                          loop_boundaries.thread.team_size(),
+                          blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}
+
+// all-reduce across corresponding vector lanes between team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  //TODO what is the point of creating this temporary?
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  strided_shfl_warp_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+                          initialized_result,
+                          loop_boundaries.thread.team_size(),
+                          blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
+}
+
+// all-reduce within team members within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType& join,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+  initialized_result = result;
+
+  multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+}
+
+// all-reduce within team members within warp
+// if no join() provided, use sum
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result) {
+
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  initialized_result = result;
+
+  //initialized_result = multi_shfl_warp_reduction(
+  multi_shfl_warp_reduction(
+                          [&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
+                          initialized_result,
+                          blockDim.x);
+  initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
+}
+
+// scan across corresponding vector lanes between team members within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda) {
+
+  ValueType accum = 0 ;
+  ValueType val, y, local_total;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // intra-blockDim.y exclusive scan on 'val'
+    // accum = accumulated, sum in total for this iteration
+
+    // INCLUSIVE scan
+    for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
+      y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
+      if(threadIdx.y*blockDim.x >= offset) { val += y; }
+    }
+
+    // pass accum to all threads
+    local_total = shfl_warp_broadcast<ValueType>(val,
+                                            threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
+                                            Impl::CudaTraits::WarpSize);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
+    if ( threadIdx.y == 0 ) { val = 0 ; }
+
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+// scan within team member (vector) within warp
+// assume vec_length*team_size == warp_size 
+// blockDim.x == vec_length == stride
+// blockDim.y == team_size
+// threadIdx.x == position in vec
+// threadIdx.y == member number
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;
+  ValueType val, y, local_total;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    val = 0;
+    lambda(i,val,false);
+
+    // intra-blockDim.x exclusive scan on 'val'
+    // accum = accumulated, sum in total for this iteration
+
+    // INCLUSIVE scan
+    for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
+      y = Kokkos::shfl_up(val, offset, blockDim.x);
+      if(threadIdx.x >= offset) { val += y; }
+    }
+
+    // pass accum to all threads
+    local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
+
+    // make EXCLUSIVE scan by shifting values over one
+    val = Kokkos::shfl_up(val, 1, blockDim.x);
+    if ( threadIdx.x == 0 ) { val = 0 ; }
+
+    val += accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
index f470a0a6ef..bb3cd2640d 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
@@ -46,9 +46,10 @@
 #include <stdio.h>
 #include <iostream>
 #include <sstream>
+#include <Kokkos_Core.hpp>
 #include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
 
-#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
 
 // #define DETAILED_PRINT
 
@@ -93,9 +94,8 @@ CudaTaskPolicyQueue
   , const unsigned arg_team_size
   )
   : m_space( Kokkos::CudaUVMSpace()
-           , arg_task_max_size
-           , arg_task_max_size * arg_task_max_count
-           , 1 /* only one level of memory pool */
+           , arg_task_max_size * arg_task_max_count * 1.2
+           , 16 /* log2(superblock size) */
            )
   , m_team { 0 , 0 , 0 }
   , m_serial { 0 , 0 , 0 }
@@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
           member( kokkos_impl_cuda_shared_memory<void>()
                 , 16                      /* shared_begin */
                 , team_task->m_shmem_size /* shared size */
+                , 0                       /* scratch level 1 pointer */
+                , 0                       /* scratch level 1 size */
                 , 0                       /* league rank */
                 , 1                       /* league size */
                 );
@@ -926,5 +928,5 @@ void Task::clear_dependence()
 } /* namespace Kokkos */
 
 
-#endif  /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
+#endif  /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp
index 1b645c8819..e71512f039 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_TaskPolicy.hpp
@@ -47,19 +47,11 @@
 #define KOKKOS_CUDA_TASKPOLICY_HPP
 
 #include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_HAVE_CUDA ) && \
-    defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
-
-#define KOKKOS_ENABLE_CUDA_TASK_POLICY
-
-/* The TaskPolicy< Cuda > capability requires nvcc using the option:
- *    --relocatable-device-code=true
- */
-
 #include <Kokkos_Cuda.hpp>
 #include <Kokkos_TaskPolicy.hpp>
 
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -81,8 +73,6 @@ public:
 
 private:
 
-  friend struct CudaTaskPolicyQueue ;
-
   CudaTaskPolicyQueue   * m_policy ;
   TaskMember * volatile * m_queue ;
   function_team_type      m_team ;    ///< Apply function on CUDA
@@ -819,9 +809,11 @@ public:
   static member_type member_single()
     {
       return
-        member_type( 0 /* shared memory */
-                   , 0 /* shared memory begin */
-                   , 0 /* shared memory size */
+        member_type( 0 /* shared memory pointer */
+                   , 0 /* shared memory begin offset */
+                   , 0 /* shared memory end offset */
+                   , 0 /* scratch level_1 pointer */
+                   , 0 /* scratch level_1 size */
                    , 0 /* league rank */
                    , 1 /* league size */ );
     }
@@ -832,10 +824,10 @@ public:
 } /* namespace Experimental */
 } /* namespace Kokkos */
 
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
 
 //----------------------------------------------------------------------------
 
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */
 
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
index 84c2e75dc2..92f6fc1f5f 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -56,8 +56,6 @@
 #include <impl/Kokkos_Shape.hpp>
 #include <Kokkos_View.hpp>
 
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
-
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
-// Via reinterpret_case this can be used to support all scalar types of those sizes.
-// Any other scalar type falls back to either normal reads out of global memory,
-// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
-
-template< typename ValueType
-        , class MemorySpace
-        , class AliasType =
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  4 ) , int ,
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  8 ) , ::int2 ,
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
-            typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
-            >::type
-            >::type
-            >::type
-            >::type
-        >
-class CudaTextureFetch {
-private:
-
-  cuda_texture_object_type  m_obj ;
-  const ValueType         * m_alloc_ptr ;
-  int                       m_offset ;
-
-  void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
-  {
-    typedef char const * const byte;
-
-    m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
-
-    size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
-    const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
-
-    const size_t count = tracker.alloc_size() / sizeof(ValueType);
-    const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
-
-    if (ok_aligned && ok_contains) {
-      if (tracker.attribute() == NULL ) {
-        MemorySpace::texture_object_attach(
-            tracker
-            , sizeof(ValueType)
-            , cudaCreateChannelDesc< AliasType >()
-            );
-      }
-      m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
-      m_offset = arg_ptr - m_alloc_ptr;
-    }
-    else if( !ok_contains ) {
-      throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
-    }
-    else {
-      throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
-    }
-  }
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
-
-  KOKKOS_INLINE_FUNCTION
-  ~CudaTextureFetch() {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const CudaTextureFetch & rhs )
-    : m_obj(       rhs.m_obj )
-    , m_alloc_ptr( rhs.m_alloc_ptr )
-    , m_offset(    rhs.m_offset )
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
-    {
-      m_obj       = rhs.m_obj ;
-      m_alloc_ptr = rhs.m_alloc_ptr ;
-      m_offset    = rhs.m_offset ;
-      return *this ;
-    }
-
-  KOKKOS_INLINE_FUNCTION explicit
-  CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
-    : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
-    {
-      #if defined( KOKKOS_USE_LDG_INTRINSIC )
-        m_alloc_ptr(arg_ptr);
-      #elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
-        if ( arg_ptr != NULL ) {
-          if ( tracker.is_valid() ) {
-            attach( arg_ptr, tracker );
-          }
-          else {
-            AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
-            if ( found_tracker.is_valid() ) {
-              attach( arg_ptr, found_tracker );
-            } else {
-              throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
-            }
-          }
-        }
-      #endif
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
-
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator[]( const iType & i ) const
-    {
-      #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
-        AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
-        return  *(reinterpret_cast<ValueType*> (&v));
-      #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
-        AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
-        return  *(reinterpret_cast<ValueType*> (&v));
-      #else
-        return m_alloc_ptr[ i + m_offset ];
-      #endif
-  }
-};
-
-
-template< typename ValueType, class MemorySpace >
-class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
-private:
-  typedef float4 AliasType;
-  cuda_texture_object_type  m_obj ;
-  const ValueType         * m_alloc_ptr ;
-  int                       m_offset ;
-
-  void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
-  {
-    typedef char const * const byte;
-
-    m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
-
-    size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
-    const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
-
-    const size_t count = tracker.alloc_size() / sizeof(ValueType);
-    const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
-
-    if (ok_aligned && ok_contains) {
-      if (tracker.attribute() == NULL ) {
-        MemorySpace::texture_object_attach(
-            tracker
-            , sizeof(ValueType)
-            , cudaCreateChannelDesc< AliasType >()
-            );
-      }
-      m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
-      m_offset = arg_ptr - m_alloc_ptr;
-    }
-    else if( !ok_contains ) {
-      throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
-    }
-    else {
-      throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
-    }
-  }
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
-
-  KOKKOS_INLINE_FUNCTION
-  ~CudaTextureFetch() {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const CudaTextureFetch & rhs )
-    : m_obj(       rhs.m_obj )
-    , m_alloc_ptr( rhs.m_alloc_ptr )
-    , m_offset(    rhs.m_offset )
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
-    {
-      m_obj       = rhs.m_obj ;
-      m_alloc_ptr = rhs.m_alloc_ptr ;
-      m_offset    = rhs.m_offset ;
-      return *this ;
-    }
-
-  KOKKOS_INLINE_FUNCTION explicit
-  CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
-    : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
-    {
-      #if defined( KOKKOS_USE_LDG_INTRINSIC )
-        m_alloc_ptr(arg_ptr);
-      #elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
-        if ( arg_ptr != NULL ) {
-          if ( tracker.is_valid() ) {
-            attach( arg_ptr, tracker );
-          }
-          else {
-            AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
-            if ( found_tracker.is_valid() ) {
-              attach( arg_ptr, found_tracker );
-            } else {
-              throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
-            }
-          }
-        }
-      #endif
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
-
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator[]( const iType & i ) const
-    {
-      #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
-        AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
-        return  *(reinterpret_cast<ValueType*> (&v));
-      #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
-        union Float4ValueType {
-          float4 f4[2];
-          ValueType val;
-        };
-        Float4ValueType convert;
-        convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
-        convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
-        return  convert.val;
-      #else
-        return m_alloc_ptr[ i + m_offset ];
-      #endif
-  }
-};
-
-template< typename ValueType, class MemorySpace >
-class CudaTextureFetch< const ValueType, MemorySpace, void >
-{
-private:
-  const ValueType * m_ptr ;
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch() : m_ptr(0) {};
-
-  KOKKOS_INLINE_FUNCTION
-  ~CudaTextureFetch() {
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
-    m_ptr = rhs.m_ptr;
-    return *this ;
-  }
-
-  explicit KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
-    m_ptr = base_view_ptr;
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
-    m_ptr = base_view_ptr;
-    return *this;
-  }
-
-
-  KOKKOS_INLINE_FUNCTION
-  operator const ValueType * () const { return m_ptr ; }
-
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  ValueType operator[]( const iType & i ) const
-  {
-    return m_ptr[ i ];
-  }
-};
-
-} // namespace Impl
-} // namespace Kokkos
-
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
- *          if 'const' value type, CudaSpace and random access.
- */
-template< class ViewTraits >
-class ViewDataHandle< ViewTraits ,
-  typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
-                        is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
-                      &&
-                      is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
-                      &&
-                      ViewTraits::memory_traits::RandomAccess
-                    >::type >
-{
-public:
-  enum { ReturnTypeIsReference = false };
-
-  typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
-                                , typename ViewTraits::memory_space> handle_type;
-
-  KOKKOS_INLINE_FUNCTION
-  static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
-  {
-    return handle_type(arg_data_ptr, arg_tracker);
-  }
-
-  typedef typename ViewTraits::value_type return_type;
-};
-
-}
-}
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif // KOKKOS_HAVE_CUDA
 #endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
 
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
new file mode 100644
index 0000000000..e813285fc7
--- /dev/null
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -0,0 +1,611 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <initializer_list>
+
+#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
+#define KOKKOS_MDRANGE_IVDEP
+#endif
+
+namespace Kokkos { namespace Experimental {
+
+enum class Iterate
+{
+  Default, // Default for the device
+  Left,    // Left indices stride fastest
+  Right,   // Right indices stride fastest
+  Flat,    // Do not tile, only valid for inner direction
+};
+
+template <typename ExecSpace>
+struct default_outer_direction
+{
+  using type = Iterate;
+  static constexpr Iterate value = Iterate::Right;
+};
+
+template <typename ExecSpace>
+struct default_inner_direction
+{
+  using type = Iterate;
+  static constexpr Iterate value = Iterate::Right;
+};
+
+
+// Iteration Pattern
+template < unsigned N
+         , Iterate OuterDir = Iterate::Default
+         , Iterate InnerDir = Iterate::Default
+         >
+struct Rank
+{
+  static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
+  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
+  static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
+
+  using iteration_pattern = Rank<N, OuterDir, InnerDir>;
+
+  static constexpr int rank = N;
+  static constexpr Iterate outer_direction = OuterDir;
+  static constexpr Iterate inner_direction = InnerDir;
+};
+
+
+
+// multi-dimensional iteration pattern
+template <typename... Properties>
+struct MDRangePolicy
+{
+  using range_policy = RangePolicy<Properties...>;
+
+  static_assert( !std::is_same<range_policy,void>::value
+               , "Kokkos Error: MD iteration pattern not defined" );
+
+  using iteration_pattern   = typename range_policy::iteration_pattern;
+  using work_tag            = typename range_policy::work_tag;
+
+  static constexpr int rank = iteration_pattern::rank;
+
+  static constexpr int outer_direction = static_cast<int> (
+      (iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
+    ? iteration_pattern::outer_direction
+    : default_outer_direction< typename range_policy::execution_space>::value );
+
+  static constexpr int inner_direction = static_cast<int> (
+      iteration_pattern::inner_direction != Iterate::Default
+    ? iteration_pattern::inner_direction
+    : default_inner_direction< typename range_policy::execution_space>::value ) ;
+
+
+  // Ugly ugly workaround intel 14 not handling scoped enum correctly
+  static constexpr int Flat = static_cast<int>( Iterate::Flat );
+  static constexpr int Right = static_cast<int>( Iterate::Right );
+
+
+  using size_type   = typename range_policy::index_type;
+  using index_type  = typename std::make_signed<size_type>::type;
+
+
+  template <typename I>
+  MDRangePolicy( std::initializer_list<I> upper_corner )
+  {
+    static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
+
+    // TODO check size of lists equal to rank
+    // static_asserts on initializer_list.size() require c++14
+
+    //static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
+
+    const auto u = upper_corner.begin();
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(0);
+      m_dim[i]    = static_cast<index_type>(u[i]);
+      if (inner_direction != Flat) {
+        // default tile size to 4
+        m_tile[i] = 4;
+      } else {
+        m_tile[i] = 1;
+      }
+      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
+      m_num_tiles *= m_tile_dim[i];
+    }
+  }
+
+  template <typename IA, typename IB>
+  MDRangePolicy( std::initializer_list<IA> corner_a
+               , std::initializer_list<IB> corner_b
+               )
+  {
+    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
+    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
+
+    // TODO check size of lists equal to rank
+    // static_asserts on initializer_list.size() require c++14
+    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
+    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
+
+
+    using A = typename std::make_signed<IA>::type;
+    using B = typename std::make_signed<IB>::type;
+
+    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
+    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
+      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
+      if (inner_direction != Flat) {
+        // default tile size to 4
+        m_tile[i] = 4;
+      } else {
+        m_tile[i] = 1;
+      }
+      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
+      m_num_tiles *= m_tile_dim[i];
+    }
+  }
+
+  template <typename IA, typename IB, typename T>
+  MDRangePolicy( std::initializer_list<IA> corner_a
+               , std::initializer_list<IB> corner_b
+               , std::initializer_list<T> tile
+               )
+  {
+    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
+    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
+    static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
+    static_assert( inner_direction != Flat, "Kokkos Error: tiling not supported with flat iteration" );
+
+    // TODO check size of lists equal to rank
+    // static_asserts on initializer_list.size() require c++14
+    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
+    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
+    //static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
+
+    using A = typename std::make_signed<IA>::type;
+    using B = typename std::make_signed<IB>::type;
+
+    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
+    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
+    const auto t = tile.begin();
+
+    m_num_tiles = 1;
+    for (int i=0; i<rank; ++i) {
+      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
+      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
+      m_tile[i]   = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
+      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
+      m_num_tiles *= m_tile_dim[i];
+    }
+  }
+
+  index_type   m_offset[rank];
+  index_type   m_dim[rank];
+  int          m_tile[rank];
+  index_type   m_tile_dim[rank];
+  size_type    m_num_tiles;       // product of tile dims
+};
+
+namespace Impl {
+
+// Serial, Threads, OpenMP
+// use enable_if to overload for Cuda
+template < typename MDRange, typename Functor, typename Enable = void >
+struct MDForFunctor
+{
+  using work_tag   = typename MDRange::work_tag;
+  using index_type = typename MDRange::index_type;
+  using size_type  = typename MDRange::size_type;
+
+  MDRange m_range;
+  Functor m_func;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange const& range, Functor const& f )
+    : m_range(range)
+    , m_func( f )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange const& range, Functor && f )
+    : m_range(range)
+    , m_func( std::forward<Functor>(f) )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange && range, Functor const& f )
+    : m_range( std::forward<MDRange>(range) )
+    , m_func( f )
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDRange && range, Functor && f )
+    : m_range( std::forward<MDRange>(range) )
+    , m_func( std::forward<Functor>(f) )
+  {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDForFunctor const& ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor& operator=( MDForFunctor const& ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor( MDForFunctor && ) = default;
+
+  KOKKOS_INLINE_FUNCTION
+  MDForFunctor& operator=( MDForFunctor && ) = default;
+
+  // Rank-2, Flat, No Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
+            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
+    } else {
+      m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+    }
+  }
+
+  // Rank-2, Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
+            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
+    } else {
+      m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+    }
+  }
+
+  // Rank-2, Not Flat, No Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    index_type t0, t1;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      t0 = t / m_range.m_tile_dim[1];
+      t1 = t % m_range.m_tile_dim[1];
+    } else {
+      t0 = t % m_range.m_tile_dim[0];
+      t1 = t / m_range.m_tile_dim[0];
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (int i0=b0; i0<e0; ++i0) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i1=b1; i1<e1; ++i1) {
+        m_func( i0, i1 );
+      }}
+    } else {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i0=b0; i0<e0; ++i0) {
+        m_func( i0, i1 );
+      }}
+    }
+  }
+
+  // Rank-2, Not Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 2
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    work_tag tag;
+
+    index_type t0, t1;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      t0 = t / m_range.m_tile_dim[1];
+      t1 = t % m_range.m_tile_dim[1];
+    } else {
+      t0 = t % m_range.m_tile_dim[0];
+      t1 = t / m_range.m_tile_dim[0];
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (int i0=b0; i0<e0; ++i0) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i1=b1; i1<e1; ++i1) {
+        m_func( tag, i0, i1 );
+      }}
+    } else {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i0=b0; i0<e0; ++i0) {
+        m_func( tag, i0, i1 );
+      }}
+    }
+  }
+
+  //---------------------------------------------------------------------------
+
+  // Rank-3, Flat, No Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+    const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
+    m_func( m_range.m_offset[0] + (  t / tmp_prod )
+          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
+          , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
+          );
+    } else {
+    const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
+    m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
+          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
+          , m_range.m_offset[2] + (  t / tmp_prod )
+          );
+    }
+  }
+
+  // Rank-3, Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction == MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
+      m_func( work_tag{}
+            , m_range.m_offset[0] + (  t / tmp_prod )
+            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
+            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
+            );
+    } else {
+      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
+      m_func( work_tag{}
+            , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
+            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
+            , m_range.m_offset[2] + (  t / tmp_prod )
+            );
+    }
+  }
+
+  // Rank-3, Not Flat, No Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    index_type t0, t1, t2;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
+      t0 = t / tmp_prod;
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
+      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
+    } else {
+      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
+      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
+      t2 = t / tmp_prod;
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
+
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (int i0=b0; i0<e0; ++i0) {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i2=b2; i2<e2; ++i2) {
+        m_func( i0, i1, i2 );
+      }}}
+    } else {
+      for (int i2=b2; i2<e2; ++i2) {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i0=b0; i0<e0; ++i0) {
+        m_func( i0, i1, i2 );
+      }}}
+    }
+  }
+
+  // Rank-3, Not Flat, Tag
+  template <typename Idx>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<(  std::is_integral<Idx>::value
+                          && !std::is_same<void, work_tag>::value
+                          && MDRange::rank == 3
+                          && MDRange::inner_direction != MDRange::Flat
+                          )>::type
+  operator()(Idx t) const
+  {
+    work_tag tag;
+
+    index_type t0, t1, t2;
+    if (  MDRange::outer_direction == MDRange::Right ) {
+      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
+      t0 = t / tmp_prod;
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
+      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
+    } else {
+      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
+      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
+      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
+      t2 = t / tmp_prod;
+    }
+
+    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
+    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
+    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
+
+    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
+    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
+    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
+
+    if (  MDRange::inner_direction == MDRange::Right ) {
+      for (int i0=b0; i0<e0; ++i0) {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i2=b2; i2<e2; ++i2) {
+        m_func( tag, i0, i1, i2 );
+      }}}
+    } else {
+      for (int i2=b2; i2<e2; ++i2) {
+      for (int i1=b1; i1<e1; ++i1) {
+      #if defined(KOKKOS_MDRANGE_IVDEP)
+      #pragma ivdep
+      #endif
+      for (int i0=b0; i0<e0; ++i0) {
+        m_func( tag, i0, i1, i2 );
+      }}}
+    }
+  }
+};
+
+
+
+} // namespace Impl
+
+
+template <typename MDRange, typename Functor>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    )
+{
+  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+
+  using range_policy = typename MDRange::range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
+
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    )
+{
+  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+
+  using range_policy = typename MDRange::range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
+
+}} // namespace Kokkos::Experimental
+
+#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
+
diff --git a/lib/kokkos/core/src/KokkosExp_View.hpp b/lib/kokkos/core/src/KokkosExp_View.hpp
deleted file mode 100644
index f62d318f2e..0000000000
--- a/lib/kokkos/core/src/KokkosExp_View.hpp
+++ /dev/null
@@ -1,2306 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_EXP_VIEW_HPP
-#define KOKKOS_EXP_VIEW_HPP
-
-#include <string>
-#include <algorithm>
-#include <type_traits>
-#include <initializer_list>
-
-#include <Kokkos_Core_fwd.hpp>
-#include <Kokkos_HostSpace.hpp>
-#include <Kokkos_MemoryTraits.hpp>
-#include <Kokkos_ExecPolicy.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-template< class DstMemorySpace , class SrcMemorySpace >
-struct DeepCopy ;
-
-template< class DataType >
-struct ViewArrayAnalysis ;
-
-template< class DataType , class ArrayLayout 
-        , typename ValueType =
-          typename ViewArrayAnalysis< DataType >::non_const_value_type
-        >
-struct ViewDataAnalysis ;
-
-template< class , class ... >
-class ViewMapping { public: enum { is_assignable = false }; };
-
-template< class MemorySpace >
-struct ViewOperatorBoundsErrorAbort ;
-
-template<>
-struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > {
-  static void apply( const size_t rank
-                   , const size_t n0 , const size_t n1 
-                   , const size_t n2 , const size_t n3 
-                   , const size_t n4 , const size_t n5 
-                   , const size_t n6 , const size_t n7 
-                   , const size_t i0 , const size_t i1 
-                   , const size_t i2 , const size_t i3 
-                   , const size_t i4 , const size_t i5 
-                   , const size_t i6 , const size_t i7 );
-};
-
-} /* namespace Impl */
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-/** \class ViewTraits
- *  \brief Traits class for accessing attributes of a View.
- *
- * This is an implementation detail of View.  It is only of interest
- * to developers implementing a new specialization of View.
- *
- * Template argument options:
- *   - View< DataType >
- *   - View< DataType , Space >
- *   - View< DataType , Space , MemoryTraits >
- *   - View< DataType , ArrayLayout >
- *   - View< DataType , ArrayLayout , Space >
- *   - View< DataType , ArrayLayout , MemoryTraits >
- *   - View< DataType , ArrayLayout , Space , MemoryTraits >
- *   - View< DataType , MemoryTraits >
- */
-
-template< class DataType , class ... Properties >
-struct ViewTraits ;
-
-template<>
-struct ViewTraits< void >
-{
-  typedef void  execution_space ;
-  typedef void  memory_space ;
-  typedef void  HostMirrorSpace ;
-  typedef void  array_layout ;
-  typedef void  memory_traits ;
-};
-
-template< class ... Prop >
-struct ViewTraits< void , void , Prop ... >
-{
-  // Ignore an extraneous 'void'
-  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
-  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
-  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
-  typedef typename ViewTraits<void,Prop...>::array_layout     array_layout ;
-  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
-};
-
-template< class ArrayLayout , class ... Prop >
-struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... >
-{
-  // Specify layout, keep subsequent space and memory traits arguments
-
-  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
-  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
-  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
-  typedef          ArrayLayout                                array_layout ;
-  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
-};
-
-template< class Space , class ... Prop >
-struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... >
-{
-  // Specify Space, memory traits should be the only subsequent argument.
-
-  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value
-               , "Only one View Execution or Memory Space template argument" );
-
-  typedef typename Space::execution_space                   execution_space ;
-  typedef typename Space::memory_space                      memory_space ;
-  typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space
-      HostMirrorSpace ;
-  typedef typename execution_space::array_layout            array_layout ;
-  typedef typename ViewTraits<void,Prop...>::memory_traits  memory_traits ;
-};
-
-template< class MemoryTraits , class ... Prop >
-struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... >
-{
-  // Specify memory trait, should not be any subsequent arguments
-
-  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value ||
-                 std::is_same< typename ViewTraits<void,Prop...>::memory_traits   , void >::value
-               , "MemoryTrait is the final optional template argument for a View" );
-
-  typedef void          execution_space ;
-  typedef void          memory_space ;
-  typedef void          HostMirrorSpace ;
-  typedef void          array_layout ;
-  typedef MemoryTraits  memory_traits ;
-};
-
-
-template< class DataType , class ... Properties >
-struct ViewTraits {
-private:
-
-  // Unpack the properties arguments
-  typedef ViewTraits< void , Properties ... >  prop ;
-
-  typedef typename
-    std::conditional< ! std::is_same< typename prop::execution_space , void >::value
-                    , typename prop::execution_space
-                    , Kokkos::DefaultExecutionSpace
-                    >::type
-      ExecutionSpace ;
-
-  typedef typename
-    std::conditional< ! std::is_same< typename prop::memory_space , void >::value
-                    , typename prop::memory_space
-                    , typename ExecutionSpace::memory_space
-                    >::type
-      MemorySpace ;
-
-  typedef typename
-    std::conditional< ! std::is_same< typename prop::array_layout , void >::value
-                    , typename prop::array_layout
-                    , typename ExecutionSpace::array_layout
-                    >::type
-      ArrayLayout ;
-
-  typedef typename
-    std::conditional
-      < ! std::is_same< typename prop::HostMirrorSpace , void >::value
-      , typename prop::HostMirrorSpace
-      , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space
-      >::type
-      HostMirrorSpace ;
-
-  typedef typename
-    std::conditional< ! std::is_same< typename prop::memory_traits , void >::value
-                    , typename prop::memory_traits
-                    , typename Kokkos::MemoryManaged
-                    >::type
-      MemoryTraits ;
-
-  // Analyze data type's properties,
-  // May be specialized based upon the layout and value type
-  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ;
-
-public:
-
-  //------------------------------------
-  // Data type traits:
-
-  typedef typename data_analysis::type            data_type ;
-  typedef typename data_analysis::const_type      const_data_type ;
-  typedef typename data_analysis::non_const_type  non_const_data_type ;
-
-  //------------------------------------
-  // Compatible array of trivial type traits:
-
-  typedef typename data_analysis::scalar_array_type            scalar_array_type ;
-  typedef typename data_analysis::const_scalar_array_type      const_scalar_array_type ;
-  typedef typename data_analysis::non_const_scalar_array_type  non_const_scalar_array_type ;
-
-  //------------------------------------
-  // Value type traits:
-
-  typedef typename data_analysis::value_type            value_type ;
-  typedef typename data_analysis::const_value_type      const_value_type ;
-  typedef typename data_analysis::non_const_value_type  non_const_value_type ;
-
-  //------------------------------------
-  // Mapping traits:
-
-  typedef ArrayLayout                         array_layout ;
-  typedef typename data_analysis::dimension   dimension ;
-  typedef typename data_analysis::specialize  specialize /* mapping specialization tag */ ;
-
-  enum { rank         = dimension::rank };
-  enum { rank_dynamic = dimension::rank_dynamic };
-
-  //------------------------------------
-  // Execution space, memory space, memory access traits, and host mirror space.
-
-  typedef ExecutionSpace                              execution_space ;
-  typedef MemorySpace                                 memory_space ;
-  typedef Kokkos::Device<ExecutionSpace,MemorySpace>  device_type ;
-  typedef MemoryTraits                                memory_traits ;
-  typedef HostMirrorSpace                             host_mirror_space ;
-
-  typedef typename MemorySpace::size_type  size_type ;
-
-  enum { is_hostspace      = std::is_same< MemorySpace , HostSpace >::value };
-  enum { is_managed        = MemoryTraits::Unmanaged    == 0 };
-  enum { is_random_access  = MemoryTraits::RandomAccess == 1 };
-
-  //------------------------------------
-};
-
-/** \class View
- *  \brief View to an array of data.
- *
- * A View represents an array of one or more dimensions.
- * For details, please refer to Kokkos' tutorial materials.
- *
- * \section Kokkos_View_TemplateParameters Template parameters
- *
- * This class has both required and optional template parameters.  The
- * \c DataType parameter must always be provided, and must always be
- * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are
- * placeholders for different template parameters.  The default value
- * of the fifth template parameter \c Specialize suffices for most use
- * cases.  When explaining the template parameters, we won't refer to
- * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer
- * to the valid categories of template parameters, in whatever order
- * they may occur.
- *
- * Valid ways in which template arguments may be specified:
- *   - View< DataType >
- *   - View< DataType , Layout >
- *   - View< DataType , Layout , Space >
- *   - View< DataType , Layout , Space , MemoryTraits >
- *   - View< DataType , Space >
- *   - View< DataType , Space , MemoryTraits >
- *   - View< DataType , MemoryTraits >
- *
- * \tparam DataType (required) This indicates both the type of each
- *   entry of the array, and the combination of compile-time and
- *   run-time array dimension(s).  For example, <tt>double*</tt>
- *   indicates a one-dimensional array of \c double with run-time
- *   dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int
- *   with run-time first dimension and compile-time second dimension
- *   (of 3).  In general, the run-time dimensions (if any) must go
- *   first, followed by zero or more compile-time dimensions.  For
- *   more examples, please refer to the tutorial materials.
- *
- * \tparam Space (required) The memory space.
- *
- * \tparam Layout (optional) The array's layout in memory.  For
- *   example, LayoutLeft indicates a column-major (Fortran style)
- *   layout, and LayoutRight a row-major (C style) layout.  If not
- *   specified, this defaults to the preferred layout for the
- *   <tt>Space</tt>.
- *
- * \tparam MemoryTraits (optional) Assertion of the user's intended
- *   access behavior.  For example, RandomAccess indicates read-only
- *   access with limited spatial locality, and Unmanaged lets users
- *   wrap externally allocated memory in a View without automatic
- *   deallocation.
- *
- * \section Kokkos_View_MT MemoryTraits discussion
- *
- * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space
- *
- * Some \c MemoryTraits options may have different interpretations for
- * different \c Space types.  For example, with the Cuda device,
- * \c RandomAccess tells Kokkos to fetch the data through the texture
- * cache, whereas the non-GPU devices have no such hardware construct.
- *
- * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits
- *
- * Users should defer applying the optional \c MemoryTraits parameter
- * until the point at which they actually plan to rely on it in a
- * computational kernel.  This minimizes the number of template
- * parameters exposed in their code, which reduces the cost of
- * compilation.  Users may always assign a View without specified
- * \c MemoryTraits to a compatible View with that specification.
- * For example:
- * \code
- * // Pass in the simplest types of View possible.
- * void
- * doSomething (View<double*, Cuda> out,
- *              View<const double*, Cuda> in)
- * {
- *   // Assign the "generic" View in to a RandomAccess View in_rr.
- *   // Note that RandomAccess View objects must have const data.
- *   View<const double*, Cuda, RandomAccess> in_rr = in;
- *   // ... do something with in_rr and out ...
- * }
- * \endcode
- */
-template< class DataType , class ... Properties >
-class View ;
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#include <impl/KokkosExp_ViewMapping.hpp>
-#include <impl/KokkosExp_ViewArray.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-namespace {
-
-constexpr Kokkos::Experimental::Impl::ALL_t
-  ALL = Kokkos::Experimental::Impl::ALL_t();
-
-constexpr Kokkos::Experimental::Impl::WithoutInitializing_t
-  WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t();
-
-constexpr Kokkos::Experimental::Impl::AllowPadding_t       
-  AllowPadding        = Kokkos::Experimental::Impl::AllowPadding_t();
-
-}
-
-/** \brief  Create View allocation parameter bundle from argument list.
- *
- *  Valid argument list members are:
- *    1) label as a "string" or std::string
- *    2) memory space instance of the View::memory_space type
- *    3) execution space instance compatible with the View::memory_space
- *    4) Kokkos::WithoutInitializing to bypass initialization
- *    4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
- */
-template< class ... Args >
-inline
-Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
-view_alloc( Args const & ... args )
-{
-  typedef 
-    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
-      return_type ;
-
-  static_assert( ! return_type::has_pointer
-               , "Cannot give pointer-to-memory for view allocation" );
-
-  return return_type( args... );
-}
-
-template< class ... Args >
-inline
-Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
-view_wrap( Args const & ... args )
-{
-  typedef 
-    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
-      return_type ;
-
-  static_assert( ! return_type::has_memory_space &&
-                 ! return_type::has_execution_space &&
-                 ! return_type::has_label &&
-                 return_type::has_pointer
-               , "Must only give pointer-to-memory for view wrapping" );
-
-  return return_type( args... );
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-template< class DataType , class ... Properties >
-class View ;
-
-template< class > struct is_view : public std::false_type {};
-
-template< class D, class ... P >
-struct is_view< View<D,P...> > : public std::true_type {};
-
-template< class DataType , class ... Properties >
-class View : public ViewTraits< DataType , Properties ... > {
-private:
-
-  template< class , class ... > friend class View ;
-  template< class , class ... > friend class Impl::ViewMapping ;
-
-public:
-
-  typedef ViewTraits< DataType , Properties ... > traits ;
-
-private:
-
-  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
-  typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
-
-  track_type  m_track ;
-  map_type    m_map ;
-
-public:
-
-  //----------------------------------------
-  /** \brief  Compatible view of array of scalar types */
-  typedef View< typename traits::scalar_array_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > 
-    array_type ;
-
-  /** \brief  Compatible view of const data type */
-  typedef View< typename traits::const_data_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > 
-    const_type ;
-
-  /** \brief  Compatible view of non-const data type */
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > 
-    non_const_type ;
-
-  /** \brief  Compatible HostMirror view */
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::host_mirror_space >
-    HostMirror ;
-
-  //----------------------------------------
-  // Domain rank and extents
-
-  enum { Rank = map_type::Rank };
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
-  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
-  extent( const iType & r ) const
-    { return m_map.extent(r); }
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
-  typename std::enable_if< std::is_integral<iType>::value , int >::type
-  extent_int( const iType & r ) const
-    { return static_cast<int>(m_map.extent(r)); }
-
-  KOKKOS_INLINE_FUNCTION constexpr
-  typename traits::array_layout layout() const
-    { return m_map.layout(); }
-
-  //----------------------------------------
-  /*  Deprecate all 'dimension' functions in favor of
-   *  ISO/C++ vocabulary 'extent'.
-   */
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
-  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
-  dimension( const iType & r ) const { return extent( r ); }
-
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
-
-  //----------------------------------------
-
-  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() *
-                                                                m_map.dimension_1() *
-                                                                m_map.dimension_2() *
-                                                                m_map.dimension_3() *
-                                                                m_map.dimension_4() *
-                                                                m_map.dimension_5() *
-                                                                m_map.dimension_6() *
-                                                                m_map.dimension_7(); }
-
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
-  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
-
-  //----------------------------------------
-  // Range span is the span which contains all members.
-
-  typedef typename map_type::reference_type  reference_type ;
-  typedef typename map_type::pointer_type    pointer_type ;
-
-  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
-
-  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
-  // Deprecated, use 'span()' instead
-  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
-  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
-  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
-
-  // Deprecated, use 'span_is_contigous()' instead
-  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
-  // Deprecated, use 'data()' instead
-  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
-
-  //----------------------------------------
-  // Allow specializations to query their specialized map
-
-  KOKKOS_INLINE_FUNCTION
-  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
-  implementation_map() const { return m_map ; }
-
-  //----------------------------------------
-
-private:
-
-  enum {
-    is_layout_left = std::is_same< typename traits::array_layout
-                                  , Kokkos::LayoutLeft >::value ,
-
-    is_layout_right = std::is_same< typename traits::array_layout
-                                  , Kokkos::LayoutRight >::value ,
-
-    is_layout_stride = std::is_same< typename traits::array_layout
-                                   , Kokkos::LayoutStride >::value ,
-
-    is_default_map =
-      std::is_same< typename traits::specialize , void >::value &&
-      ( is_layout_left || is_layout_right || is_layout_stride )
-  };
-
-#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
-
-#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
-    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
-  Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ;
-
-#else
-
-#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
-    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
-
-#endif
-
-public:
-
-  //------------------------------
-  // Rank 0 operator()
-
-  template< class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value
-                            && ( 0 == Rank )
-                          ), reference_type >::type
-  operator()( Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) )
-
-      return m_map.reference();
-    }
-
-  //------------------------------
-  // Rank 1 operator()
-
-  template< typename I0
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,Args...>::value
-      && ( 1 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
-
-      return m_map.reference(i0);
-    }
-
-  template< typename I0
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,Args...>::value
-      && ( 1 == Rank )
-      && is_default_map
-      && ! is_layout_stride
-    ), reference_type >::type
-  operator()( const I0 & i0
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
-
-      return m_map.m_handle[ i0 ];
-    }
-
-  template< typename I0
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,Args...>::value
-      && ( 1 == Rank )
-      && is_default_map
-      && is_layout_stride
-    ), reference_type >::type
-  operator()( const I0 & i0
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
-
-      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
-    }
-
-  //------------------------------
-  // Rank 1 operator[]
-
-  template< typename I0 >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0>::value
-      && ( 1 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator[]( const I0 & i0 ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
-
-      return m_map.reference(i0);
-    }
-
-  template< typename I0 >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0>::value
-      && ( 1 == Rank )
-      && is_default_map
-      && ! is_layout_stride
-    ), reference_type >::type
-  operator[]( const I0 & i0 ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
-
-      return m_map.m_handle[ i0 ];
-    }
-
-  template< typename I0 >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0>::value
-      && ( 1 == Rank )
-      && is_default_map
-      && is_layout_stride
-    ), reference_type >::type
-  operator[]( const I0 & i0 ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
-
-      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
-    }
-
-  //------------------------------
-  // Rank 2
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.reference(i0,i1);
-    }
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && is_default_map
-      && is_layout_left && ( traits::rank_dynamic == 0 )
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
-    }
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && is_default_map
-      && is_layout_left && ( traits::rank_dynamic != 0 )
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
-    }
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && is_default_map
-      && is_layout_right && ( traits::rank_dynamic == 0 )
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
-    }
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && is_default_map
-      && is_layout_right && ( traits::rank_dynamic != 0 )
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
-    }
-
-  template< typename I0 , typename I1
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
-      && ( 2 == Rank )
-      && is_default_map
-      && is_layout_stride
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
-
-      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
-                             i1 * m_map.m_offset.m_stride.S1 ];
-    }
-
-  //------------------------------
-  // Rank 3
-
-  template< typename I0 , typename I1 , typename I2
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
-      && ( 3 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
-      && ( 3 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
-
-      return m_map.reference(i0,i1,i2);
-    }
-
-  //------------------------------
-  // Rank 4
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
-      && ( 4 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
-      && ( 4 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
-
-      return m_map.reference(i0,i1,i2,i3);
-    }
-
-  //------------------------------
-  // Rank 5
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
-      && ( 5 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
-      && ( 5 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
-
-      return m_map.reference(i0,i1,i2,i3,i4);
-    }
-
-  //------------------------------
-  // Rank 6
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
-      && ( 6 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
-      && ( 6 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
-
-      return m_map.reference(i0,i1,i2,i3,i4,i5);
-    }
-
-  //------------------------------
-  // Rank 7
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5 , typename I6
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
-      && ( 7 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5 , const I6 & i6
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5 , typename I6
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
-      && ( 7 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5 , const I6 & i6
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
-
-      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
-    }
-
-  //------------------------------
-  // Rank 8
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5 , typename I6 , typename I7
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
-      && ( 8 == Rank )
-      && is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-
-      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
-    }
-
-  template< typename I0 , typename I1 , typename I2 , typename I3
-          , typename I4 , typename I5 , typename I6 , typename I7
-          , class ... Args >
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<
-    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
-      && ( 8 == Rank )
-      && ! is_default_map
-    ), reference_type >::type
-  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
-            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
-            , Args ... args ) const
-    {
-      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
-
-      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
-    }
-
-#undef KOKKOS_VIEW_OPERATOR_VERIFY
-
-  //----------------------------------------
-  // Standard destructor, constructors, and assignment operators
-
-  KOKKOS_INLINE_FUNCTION
-  ~View() {}
-
-  KOKKOS_INLINE_FUNCTION
-  View() : m_track(), m_map() {}
-
-  KOKKOS_INLINE_FUNCTION
-  View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
-
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
-
-  //----------------------------------------
-  // Compatible view copy constructor and assignment
-  // may assign unmanaged from managed.
-
-  template< class RT , class ... RP >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<RT,RP...> & rhs )
-    : m_track( rhs.m_track , traits::is_managed )
-    , m_map()
-    {
-      typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
-      static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
-      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
-    }
-
-  template< class RT , class ... RP >
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View<RT,RP...> & rhs )
-    {
-      typedef typename View<RT,RP...>::traits  SrcTraits ;
-      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
-      static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
-      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
-      m_track.assign( rhs.m_track , traits::is_managed );
-      return *this ;
-    }
-
-  //----------------------------------------
-  // Compatible subview constructor
-  // may assign unmanaged from managed.
-
-  template< class RT , class ... RP , class Arg0 , class ... Args >
-  KOKKOS_INLINE_FUNCTION
-  View( const View< RT , RP... > & src_view
-      , const Arg0 & arg0 , Args ... args )
-    : m_track( src_view.m_track , traits::is_managed )
-    , m_map()
-    {
-      typedef View< RT , RP... > SrcType ;
-
-      typedef Kokkos::Experimental::Impl::ViewMapping
-        < void /* deduce destination view type from source view traits */
-        , typename SrcType::traits
-        , Arg0 , Args... > Mapping ;
-
-      typedef typename Mapping::type DstType ;
-
-      static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
-        , "Subview construction requires compatible view and subview arguments" );
-
-      Mapping::assign( m_map, src_view.m_map, arg0 , args... );
-    }
-
-  //----------------------------------------
-  // Allocation tracking properties
-
-  KOKKOS_INLINE_FUNCTION
-  int use_count() const
-    { return m_track.use_count(); }
-
-  inline
-  const std::string label() const
-    { return m_track.template get_label< typename traits::memory_space >(); }
-
-  //----------------------------------------
-  // Allocation according to allocation properties and array layout
-
-  template< class ... P >
-  explicit inline
-  View( const Impl::ViewCtorProp< P ... > & arg_prop
-      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
-                               , typename traits::array_layout
-                               >::type const & arg_layout
-      )
-    : m_track()
-    , m_map()
-    {
-      // Append layout and spaces if not input
-      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
-
-      // use 'std::integral_constant<unsigned,I>' for non-types
-      // to avoid duplicate class error.
-      typedef Impl::ViewCtorProp
-        < P ...
-        , typename std::conditional
-            < alloc_prop_input::has_label
-            , std::integral_constant<unsigned,0>
-            , typename std::string
-            >::type
-        , typename std::conditional
-            < alloc_prop_input::has_memory_space
-            , std::integral_constant<unsigned,1>
-            , typename traits::device_type::memory_space
-            >::type
-        , typename std::conditional
-            < alloc_prop_input::has_execution_space
-            , std::integral_constant<unsigned,2>
-            , typename traits::device_type::execution_space
-            >::type
-        > alloc_prop ;
-
-      static_assert( traits::is_managed
-                   , "View allocation constructor requires managed memory" );
-
-      if ( alloc_prop::initialize &&
-           ! alloc_prop::execution_space::is_initialized() ) {
-        // If initializing view data then
-        // the execution space must be initialized.
-        Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space");
-      }
-
-      // Copy the input allocation properties with possibly defaulted properties
-      alloc_prop prop( arg_prop );
-
-//------------------------------------------------------------
-#if defined( KOKKOS_HAVE_CUDA )
-      // If allocating in CudaUVMSpace must fence before and after
-      // the allocation to protect against possible concurrent access
-      // on the CPU and the GPU.
-      // Fence using the trait's executon space (which will be Kokkos::Cuda)
-      // to avoid incomplete type errors from usng Kokkos::Cuda directly.
-      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
-        traits::device_type::memory_space::execution_space::fence();
-      }
-#endif
-//------------------------------------------------------------
-
-      Kokkos::Experimental::Impl::SharedAllocationRecord<> *
-        record = m_map.allocate_shared( prop , arg_layout );
-
-//------------------------------------------------------------
-#if defined( KOKKOS_HAVE_CUDA )
-      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
-        traits::device_type::memory_space::execution_space::fence();
-      }
-#endif
-//------------------------------------------------------------
-
-      // Setup and initialization complete, start tracking
-      m_track.assign_allocated_record_to_uninitialized( record );
-    }
-
-  // Wrap memory according to properties and array layout
-  template< class ... P >
-  explicit KOKKOS_INLINE_FUNCTION
-  View( const Impl::ViewCtorProp< P ... > & arg_prop
-      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
-                               , typename traits::array_layout
-                               >::type const & arg_layout
-      )
-    : m_track() // No memory tracking
-    , m_map( arg_prop , arg_layout )
-    {
-      static_assert(
-        std::is_same< pointer_type
-                    , typename Impl::ViewCtorProp< P... >::pointer_type
-                    >::value ,
-        "Constructing View to wrap user memory must supply matching pointer type" );
-    }
-
-  // Simple dimension-only layout
-  template< class ... P >
-  explicit inline
-  View( const Impl::ViewCtorProp< P ... > & arg_prop
-      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
-                               , size_t 
-                               >::type const arg_N0 = 0 
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
-      )
-    : View( arg_prop
-          , typename traits::array_layout
-              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-          )
-    {}
-
-  template< class ... P >
-  explicit KOKKOS_INLINE_FUNCTION
-  View( const Impl::ViewCtorProp< P ... > & arg_prop
-      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
-                               , size_t 
-                               >::type const arg_N0 = 0 
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
-      )
-    : View( arg_prop
-          , typename traits::array_layout
-              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-          )
-    {}
-
-  // Allocate with label and layout
-  template< typename Label >
-  explicit inline
-  View( const Label & arg_label
-      , typename std::enable_if<
-          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
-          typename traits::array_layout >::type const & arg_layout
-      )
-    : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )
-    {}
-
-  // Allocate label and layout, must disambiguate from subview constructor.
-  template< typename Label >
-  explicit inline
-  View( const Label & arg_label
-      , typename std::enable_if<
-          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
-        const size_t >::type arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
-      )
-    : View( Impl::ViewCtorProp< std::string >( arg_label )
-          , typename traits::array_layout
-              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-          )
-    {}
-
-  // For backward compatibility
-  explicit inline
-  View( const ViewAllocateWithoutInitializing & arg_prop
-      , const typename traits::array_layout & arg_layout
-      )
-    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
-          , arg_layout
-          )
-    {}
-
-  explicit inline
-  View( const ViewAllocateWithoutInitializing & arg_prop
-      , const size_t arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
-      )
-    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
-          , typename traits::array_layout
-              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-          )
-    {}
-
-  //----------------------------------------
-  // Memory span required to wrap these dimensions.
-  static constexpr size_t memory_span( const size_t arg_N0 = 0
-                                     , const size_t arg_N1 = 0
-                                     , const size_t arg_N2 = 0
-                                     , const size_t arg_N3 = 0
-                                     , const size_t arg_N4 = 0
-                                     , const size_t arg_N5 = 0
-                                     , const size_t arg_N6 = 0
-                                     , const size_t arg_N7 = 0
-                                     )
-    {
-      return map_type::memory_span(
-        typename traits::array_layout
-          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
-    }
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( pointer_type arg_ptr
-      , const size_t arg_N0 = 0
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0
-      )
-    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr)
-          , typename traits::array_layout
-             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-          )
-    {}
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( pointer_type arg_ptr
-      , const typename traits::array_layout & arg_layout
-      )
-    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )
-    {}
-
-  //----------------------------------------
-  // Shared scratch memory constructor
-
-  static inline
-  size_t shmem_size( const size_t arg_N0 = 0 ,
-                     const size_t arg_N1 = 0 ,
-                     const size_t arg_N2 = 0 ,
-                     const size_t arg_N3 = 0 ,
-                     const size_t arg_N4 = 0 ,
-                     const size_t arg_N5 = 0 ,
-                     const size_t arg_N6 = 0 ,
-                     const size_t arg_N7 = 0 )
-  {
-    return map_type::memory_span(
-           typename traits::array_layout
-            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
-  }
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( const typename traits::execution_space::scratch_memory_space & arg_space
-      , const typename traits::array_layout & arg_layout )
-    : View( Impl::ViewCtorProp<pointer_type>(
-              reinterpret_cast<pointer_type>(
-                arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )
-         , arg_layout )
-    {}
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( const typename traits::execution_space::scratch_memory_space & arg_space
-      , const size_t arg_N0 = 0 
-      , const size_t arg_N1 = 0
-      , const size_t arg_N2 = 0
-      , const size_t arg_N3 = 0
-      , const size_t arg_N4 = 0
-      , const size_t arg_N5 = 0
-      , const size_t arg_N6 = 0
-      , const size_t arg_N7 = 0 )
-    : View( Impl::ViewCtorProp<pointer_type>(
-              reinterpret_cast<pointer_type>(
-                arg_space.get_shmem(
-                  map_type::memory_span(
-                    typename traits::array_layout
-                     ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
-          , typename traits::array_layout
-             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
-             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
-       )
-    {}
-};
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-template< class V , class ... Args >
-using Subview =
-  typename Kokkos::Experimental::Impl::ViewMapping
-    < void /* deduce subview type from source view traits */
-    , typename V::traits
-    , Args ...
-    >::type ;
-
-template< class D, class ... P , class ... Args >
-KOKKOS_INLINE_FUNCTION
-typename Kokkos::Experimental::Impl::ViewMapping
-  < void /* deduce subview type from source view traits */
-  , ViewTraits< D , P... >
-  , Args ...
-  >::type
-subview( const View< D, P... > & src , Args ... args )
-{
-  static_assert( View< D , P... >::Rank == sizeof...(Args) , 
-    "subview requires one argument for each source View rank" );
-
-  return typename
-    Kokkos::Experimental::Impl::ViewMapping
-      < void /* deduce subview type from source view traits */
-      , ViewTraits< D , P ... >
-      , Args ... >::type( src , args ... ); 
-}
-
-template< class MemoryTraits , class D, class ... P , class ... Args >
-KOKKOS_INLINE_FUNCTION
-typename Kokkos::Experimental::Impl::ViewMapping
-  < void /* deduce subview type from source view traits */
-  , ViewTraits< D , P... >
-  , Args ...
-  >::template apply< MemoryTraits >::type
-subview( const View< D, P... > & src , Args ... args )
-{
-  static_assert( View< D , P... >::Rank == sizeof...(Args) , 
-    "subview requires one argument for each source View rank" );
-
-  return typename
-    Kokkos::Experimental::Impl::ViewMapping
-      < void /* deduce subview type from source view traits */
-      , ViewTraits< D , P ... >
-      , Args ... >
-      ::template apply< MemoryTraits >
-      ::type( src , args ... ); 
-}
-
-
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-template< class LT , class ... LP , class RT , class ... RP >
-KOKKOS_INLINE_FUNCTION
-bool operator == ( const View<LT,LP...> & lhs ,
-                   const View<RT,RP...> & rhs )
-{
-  // Same data, layout, dimensions
-  typedef ViewTraits<LT,LP...>  lhs_traits ;
-  typedef ViewTraits<RT,RP...>  rhs_traits ;
-
-  return
-    std::is_same< typename lhs_traits::const_value_type ,
-                  typename rhs_traits::const_value_type >::value &&
-    std::is_same< typename lhs_traits::array_layout ,
-                  typename rhs_traits::array_layout >::value &&
-    std::is_same< typename lhs_traits::memory_space ,
-                  typename rhs_traits::memory_space >::value &&
-    unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) &&
-    lhs.data()        == rhs.data() &&
-    lhs.span()        == rhs.span() &&
-    lhs.dimension_0() == rhs.dimension_0() &&
-    lhs.dimension_1() == rhs.dimension_1() &&
-    lhs.dimension_2() == rhs.dimension_2() &&
-    lhs.dimension_3() == rhs.dimension_3() &&
-    lhs.dimension_4() == rhs.dimension_4() &&
-    lhs.dimension_5() == rhs.dimension_5() &&
-    lhs.dimension_6() == rhs.dimension_6() &&
-    lhs.dimension_7() == rhs.dimension_7();
-}
-
-template< class LT , class ... LP , class RT , class ... RP >
-KOKKOS_INLINE_FUNCTION
-bool operator != ( const View<LT,LP...> & lhs ,
-                   const View<RT,RP...> & rhs )
-{
-  return ! ( operator==(lhs,rhs) );
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-#if KOKKOS_USING_EXP_VIEW
-
-inline
-void shared_allocation_tracking_claim_and_disable()
-{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
-
-inline
-void shared_allocation_tracking_release_and_enable()
-{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
-
-#else
-
-inline
-void shared_allocation_tracking_claim_and_disable()
-{ Kokkos::Impl::AllocationTracker::disable_tracking(); }
-
-inline
-void shared_allocation_tracking_release_and_enable()
-{ Kokkos::Impl::AllocationTracker::enable_tracking(); }
-
-#endif
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-template< class OutputView , typename Enable = void >
-struct ViewFill {
-
-  typedef typename OutputView::const_value_type  const_value_type ;
-
-  const OutputView output ;
-  const_value_type input ;
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_t i0 ) const
-  {
-    const size_t n1 = output.dimension_1();
-    const size_t n2 = output.dimension_2();
-    const size_t n3 = output.dimension_3();
-    const size_t n4 = output.dimension_4();
-    const size_t n5 = output.dimension_5();
-    const size_t n6 = output.dimension_6();
-    const size_t n7 = output.dimension_7();
-
-    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
-    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
-    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
-    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
-    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
-    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
-    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
-      output(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
-    }}}}}}}
-  }
-
-  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
-    : output( arg_out ), input( arg_in )
-    {
-      typedef typename OutputView::execution_space  execution_space ;
-      typedef Kokkos::RangePolicy< execution_space > Policy ;
-
-      const Kokkos::Impl::ParallelFor< ViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) );
-
-      closure.execute();
-
-      execution_space::fence();
-    }
-};
-
-template< class OutputView >
-struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > {
-  ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
-    {
-      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
-        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
-    }
-};
-
-template< class OutputView , class InputView >
-struct ViewRemap {
-
-  const OutputView output ;
-  const InputView  input ;
-  const size_t n0 ;
-  const size_t n1 ;
-  const size_t n2 ;
-  const size_t n3 ;
-  const size_t n4 ;
-  const size_t n5 ;
-  const size_t n6 ;
-  const size_t n7 ;
-
-  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
-    : output( arg_out ), input( arg_in )
-    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
-    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
-    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
-    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
-    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
-    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
-    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
-    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
-    {
-      typedef typename OutputView::execution_space execution_space ;
-      typedef Kokkos::RangePolicy< execution_space > Policy ;
-      const Kokkos::Impl::ParallelFor< ViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
-      closure.execute();
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const size_t i0 ) const
-  {
-    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
-    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
-    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
-    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
-    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
-    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
-    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
-      output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7);
-    }}}}}}}
-  }
-};
-
-} /* namespace Impl */
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-/** \brief  Deep copy a value from Host memory into a view.  */
-template< class DT , class ... DP >
-inline
-void deep_copy
-  ( const View<DT,DP...> & dst
-  , typename ViewTraits<DT,DP...>::const_value_type & value
-  , typename std::enable_if<
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
-    >::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
-                  typename ViewTraits<DT,DP...>::value_type >::value
-    , "deep_copy requires non-const type" );
-
-  Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value );
-}
-
-/** \brief  Deep copy into a value in Host memory from a view.  */
-template< class ST , class ... SP >
-inline
-void deep_copy
-  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
-    >::type * = 0 )
-{
-  static_assert( ViewTraits<ST,SP...>::rank == 0 
-               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
-
-  typedef ViewTraits<ST,SP...>               src_traits ;
-  typedef typename src_traits::memory_space  src_memory_space ;
-  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
-}
-
-//----------------------------------------------------------------------------
-/** \brief  A deep copy between views of compatible type, and rank zero.  */
-template< class DT , class ... DP , class ST , class ... SP >
-inline
-void deep_copy
-  ( const View<DT,DP...> & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<(
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
-    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
-      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
-  )>::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
-    , "deep_copy requires matching non-const destination type" );
-
-  typedef View<DT,DP...>  dst_type ;
-  typedef View<ST,SP...>  src_type ;
-
-  typedef typename dst_type::value_type    value_type ;
-  typedef typename dst_type::memory_space  dst_memory_space ;
-  typedef typename src_type::memory_space  src_memory_space ;
-
-  if ( dst.data() != src.data() ) {
-    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) );
-  }
-}
-
-//----------------------------------------------------------------------------
-/** \brief  A deep copy between views of the default specialization, compatible type,
- *          same non-zero rank, same contiguous layout.
- */
-template< class DT , class ... DP , class ST , class ... SP >
-inline
-void deep_copy
-  ( const View<DT,DP...> & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<(
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
-    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
-      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
-  )>::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
-    , "deep_copy requires non-const destination type" );
-
-  static_assert(
-    ( unsigned(ViewTraits<DT,DP...>::rank) ==
-      unsigned(ViewTraits<ST,SP...>::rank) )
-    , "deep_copy requires Views of equal rank" );
-
-  typedef View<DT,DP...>  dst_type ;
-  typedef View<ST,SP...>  src_type ;
-
-  typedef typename dst_type::execution_space  dst_execution_space ;
-  typedef typename dst_type::memory_space     dst_memory_space ;
-  typedef typename src_type::memory_space     src_memory_space ;
-
-  enum { DstExecCanAccessSrc =
-   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
-
-  if ( (void *) dst.data() != (void*) src.data() ) {
-
-    // Concern: If overlapping views then a parallel copy will be erroneous.
-    // ...
-
-    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
-
-    if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                       typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
-         (
-           std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
-                         typename ViewTraits<ST,SP...>::array_layout >::value
-           ||
-           ( ViewTraits<DT,DP...>::rank == 1 &&
-             ViewTraits<ST,SP...>::rank == 1 )
-         ) &&
-         dst.span_is_contiguous() &&
-         src.span_is_contiguous() &&
-         dst.span() == src.span() &&
-         dst.dimension_0() == src.dimension_0() &&
-         dst.dimension_1() == src.dimension_1() &&
-         dst.dimension_2() == src.dimension_2() &&
-         dst.dimension_3() == src.dimension_3() &&
-         dst.dimension_4() == src.dimension_4() &&
-         dst.dimension_5() == src.dimension_5() &&
-         dst.dimension_6() == src.dimension_6() &&
-         dst.dimension_7() == src.dimension_7() ) {
-
-      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-
-      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
-    }
-    else if ( DstExecCanAccessSrc ) {
-      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
-    }
-    else {
-      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
-    }
-  }
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-/** \brief  Deep copy a value from Host memory into a view.  */
-template< class ExecSpace ,class DT , class ... DP >
-inline
-void deep_copy
-  ( const ExecSpace &
-  , const View<DT,DP...> & dst
-  , typename ViewTraits<DT,DP...>::const_value_type & value
-  , typename std::enable_if<
-    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
-    >::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
-                  typename ViewTraits<DT,DP...>::value_type >::value
-    , "deep_copy requires non-const type" );
-
-  Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value );
-}
-
-/** \brief  Deep copy into a value in Host memory from a view.  */
-template< class ExecSpace , class ST , class ... SP >
-inline
-void deep_copy
-  ( const ExecSpace & exec_space
-  , typename ViewTraits<ST,SP...>::non_const_value_type & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<
-    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
-    >::type * = 0 )
-{
-  static_assert( ViewTraits<ST,SP...>::rank == 0 
-               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
-
-  typedef ViewTraits<ST,SP...>               src_traits ;
-  typedef typename src_traits::memory_space  src_memory_space ;
-  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace >
-    ( exec_space , & dst , src.data() , sizeof(ST) );
-}
-
-//----------------------------------------------------------------------------
-/** \brief  A deep copy between views of compatible type, and rank zero.  */
-template< class ExecSpace , class DT , class ... DP , class ST , class ... SP >
-inline
-void deep_copy
-  ( const ExecSpace & exec_space
-  , const View<DT,DP...> & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<(
-    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
-    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
-      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
-  )>::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
-    , "deep_copy requires matching non-const destination type" );
-
-  typedef View<DT,DP...>  dst_type ;
-  typedef View<ST,SP...>  src_type ;
-
-  typedef typename dst_type::value_type    value_type ;
-  typedef typename dst_type::memory_space  dst_memory_space ;
-  typedef typename src_type::memory_space  src_memory_space ;
-
-  if ( dst.data() != src.data() ) {
-    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
-      ( exec_space , dst.data() , src.data() , sizeof(value_type) );
-  }
-}
-
-//----------------------------------------------------------------------------
-/** \brief  A deep copy between views of the default specialization, compatible type,
- *          same non-zero rank, same contiguous layout.
- */
-template< class ExecSpace , class DT, class ... DP, class ST, class ... SP >
-inline
-void deep_copy
-  ( const ExecSpace & exec_space
-  , const View<DT,DP...> & dst
-  , const View<ST,SP...> & src
-  , typename std::enable_if<(
-    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
-    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
-    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
-    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
-      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
-  )>::type * = 0 )
-{
-  static_assert(
-    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
-    , "deep_copy requires non-const destination type" );
-
-  static_assert(
-    ( unsigned(ViewTraits<DT,DP...>::rank) ==
-      unsigned(ViewTraits<ST,SP...>::rank) )
-    , "deep_copy requires Views of equal rank" );
-
-  typedef View<DT,DP...>  dst_type ;
-  typedef View<ST,SP...>  src_type ;
-
-  typedef typename dst_type::execution_space  dst_execution_space ;
-  typedef typename dst_type::memory_space     dst_memory_space ;
-  typedef typename src_type::memory_space     src_memory_space ;
-
-  enum { DstExecCanAccessSrc =
-   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
-
-  if ( (void *) dst.data() != (void*) src.data() ) {
-
-    // Concern: If overlapping views then a parallel copy will be erroneous.
-    // ...
-
-    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
-
-    if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
-                       typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
-         (
-           std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
-                         typename ViewTraits<ST,SP...>::array_layout >::value
-           ||
-           ( ViewTraits<DT,DP...>::rank == 1 &&
-             ViewTraits<ST,SP...>::rank == 1 )
-         ) &&
-         dst.span_is_contiguous() &&
-         src.span_is_contiguous() &&
-         dst.span() == src.span() &&
-         dst.dimension_0() == src.dimension_0() &&
-         dst.dimension_1() == src.dimension_1() &&
-         dst.dimension_2() == src.dimension_2() &&
-         dst.dimension_3() == src.dimension_3() &&
-         dst.dimension_4() == src.dimension_4() &&
-         dst.dimension_5() == src.dimension_5() &&
-         dst.dimension_6() == src.dimension_6() &&
-         dst.dimension_7() == src.dimension_7() ) {
-
-      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
-
-      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
-        ( exec_space , dst.data() , src.data() , nbytes );
-    }
-    else if ( DstExecCanAccessSrc ) {
-      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
-      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
-    }
-    else {
-      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
-    }
-  }
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-// Deduce Mirror Types
-template<class Space, class T, class ... P>
-struct MirrorViewType {
-  // The incoming view_type
-  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
-  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
-  // Check whether it is the same memory space
-  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
-  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
-  // The data type (we probably want it non-const since otherwise we can't even deep_copy to it.
-  typedef typename src_view_type::non_const_data_type data_type;
-  // The destination view type if it is not the same memory space
-  typedef Kokkos::Experimental::View<data_type,array_layout,Space> dest_view_type;
-  // If it is the same memory_space return the existsing view_type
-  // This will also keep the unmanaged trait if necessary
-  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
-};
-
-template<class Space, class T, class ... P>
-struct MirrorType {
-  // The incoming view_type
-  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
-  // The memory space for the mirror view
-  typedef typename Space::memory_space memory_space;
-  // Check whether it is the same memory space
-  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
-  // The array_layout
-  typedef typename src_view_type::array_layout array_layout;
-  // The data type (we probably want it non-const since otherwise we can't even deep_copy to it.
-  typedef typename src_view_type::non_const_data_type data_type;
-  // The destination view type if it is not the same memory space
-  typedef Kokkos::Experimental::View<data_type,array_layout,Space> view_type;
-};
-
-}
-
-template< class T , class ... P >
-inline
-typename Kokkos::Experimental::View<T,P...>::HostMirror
-create_mirror( const Kokkos::Experimental::View<T,P...> & src
-             , typename std::enable_if<
-                 ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
-                               , Kokkos::LayoutStride >::value
-               >::type * = 0
-             )
-{
-  typedef View<T,P...>                   src_type ;
-  typedef typename src_type::HostMirror  dst_type ;
-
-  return dst_type( std::string( src.label() ).append("_mirror")
-                 , src.dimension_0()
-                 , src.dimension_1()
-                 , src.dimension_2()
-                 , src.dimension_3()
-                 , src.dimension_4()
-                 , src.dimension_5()
-                 , src.dimension_6()
-                 , src.dimension_7() );
-}
-
-template< class T , class ... P >
-inline
-typename Kokkos::Experimental::View<T,P...>::HostMirror
-create_mirror( const Kokkos::Experimental::View<T,P...> & src
-             , typename std::enable_if<
-                 std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
-                             , Kokkos::LayoutStride >::value
-               >::type * = 0
-             )
-{
-  typedef View<T,P...>                   src_type ;
-  typedef typename src_type::HostMirror  dst_type ;
-
-  Kokkos::LayoutStride layout ;
-
-  layout.dimension[0] = src.dimension_0();
-  layout.dimension[1] = src.dimension_1();
-  layout.dimension[2] = src.dimension_2();
-  layout.dimension[3] = src.dimension_3();
-  layout.dimension[4] = src.dimension_4();
-  layout.dimension[5] = src.dimension_5();
-  layout.dimension[6] = src.dimension_6();
-  layout.dimension[7] = src.dimension_7();
-
-  layout.stride[0] = src.stride_0();
-  layout.stride[1] = src.stride_1();
-  layout.stride[2] = src.stride_2();
-  layout.stride[3] = src.stride_3();
-  layout.stride[4] = src.stride_4();
-  layout.stride[5] = src.stride_5();
-  layout.stride[6] = src.stride_6();
-  layout.stride[7] = src.stride_7();
-
-  return dst_type( std::string( src.label() ).append("_mirror") , layout );
-}
-
-
-// Create a mirror in a new space (specialization for different space)
-template<class Space, class T, class ... P>
-typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::View<T,P...> & src) {
-  return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
-}
-
-template< class T , class ... P >
-inline
-typename Kokkos::Experimental::View<T,P...>::HostMirror
-create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
-                  , typename std::enable_if<(
-                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
-                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
-                                  >::value
-                      &&
-                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
-                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
-                                  >::value
-                    )>::type * = 0 
-                  )
-{
-  return src ;
-}
-
-template< class T , class ... P >
-inline
-typename Kokkos::Experimental::View<T,P...>::HostMirror
-create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
-                  , typename std::enable_if< ! (
-                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
-                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
-                                  >::value
-                      &&
-                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
-                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
-                                  >::value
-                    )>::type * = 0 
-                  )
-{
-  return Kokkos::Experimental::create_mirror( src );
-}
-
-// Create a mirror view in a new space (specialization for same space)
-template<class Space, class T, class ... P>
-typename Impl::MirrorViewType<Space,T,P ...>::view_type
-create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
-  , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
-  return src;
-}
-
-// Create a mirror view in a new space (specialization for different space)
-template<class Space, class T, class ... P>
-typename Impl::MirrorViewType<Space,T,P ...>::view_type
-create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
-  , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
-  return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-
-/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
-template< class T , class ... P >
-inline
-void resize( Kokkos::Experimental::View<T,P...> & v ,
-             const size_t n0 = 0 ,
-             const size_t n1 = 0 ,
-             const size_t n2 = 0 ,
-             const size_t n3 = 0 ,
-             const size_t n4 = 0 ,
-             const size_t n5 = 0 ,
-             const size_t n6 = 0 ,
-             const size_t n7 = 0 )
-{
-  typedef Kokkos::Experimental::View<T,P...>  view_type ;
-
-  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
-
-  view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
-
-  Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v );
-
-  v = v_resized ;
-}
-
-/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
-template< class T , class ... P >
-inline
-void realloc( Kokkos::Experimental::View<T,P...> & v ,
-              const size_t n0 = 0 ,
-              const size_t n1 = 0 ,
-              const size_t n2 = 0 ,
-              const size_t n3 = 0 ,
-              const size_t n4 = 0 ,
-              const size_t n5 = 0 ,
-              const size_t n6 = 0 ,
-              const size_t n7 = 0 )
-{
-  typedef Kokkos::Experimental::View<T,P...>  view_type ;
-
-  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
-
-  const std::string label = v.label();
-
-  v = view_type(); // Deallocate first, if the only view to allocation
-  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
-}
-
-} /* namespace Experimental */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#if KOKKOS_USING_EXP_VIEW
-
-namespace Kokkos {
-
-template< class D , class ... P >
-using ViewTraits = Kokkos::Experimental::ViewTraits<D,P...> ;
-
-template< class D , class ... P >
-using View = Kokkos::Experimental::View<D,P...> ;
-
-using Kokkos::Experimental::ALL ;
-using Kokkos::Experimental::deep_copy ;
-using Kokkos::Experimental::create_mirror ;
-using Kokkos::Experimental::create_mirror_view ;
-using Kokkos::Experimental::subview ;
-using Kokkos::Experimental::resize ;
-using Kokkos::Experimental::realloc ;
-using Kokkos::Experimental::is_view ;
-
-namespace Impl {
-
-using Kokkos::Experimental::is_view ;
-
-class ViewDefault {};
-
-template< class SrcViewType
-        , class Arg0Type
-        , class Arg1Type
-        , class Arg2Type
-        , class Arg3Type
-        , class Arg4Type
-        , class Arg5Type
-        , class Arg6Type
-        , class Arg7Type
-        >
-struct ViewSubview /* { typedef ... type ; } */ ;
-
-}
-
-} /* namespace Kokkos */
-
-#include <impl/Kokkos_Atomic_View.hpp>
-
-#endif /* #if KOKKOS_USING_EXP_VIEW */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
-
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
index 11aaf96177..cdfa4429f0 100644
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -121,13 +121,22 @@ public:
     return *this;
   }
 
-  //! Assignment operator.
+  /// \brief Assignment operator, for volatile <tt>*this</tt> and
+  ///   nonvolatile input.
+  ///
+  /// \param src [in] Input; right-hand side of the assignment.
+  ///
+  /// This operator returns \c void instead of <tt>volatile
+  /// complex<RealType>& </tt>.  See Kokkos Issue #177 for the
+  /// explanation.  In practice, this means that you should not chain
+  /// assignments with volatile lvalues.
   template<class InputRealType>
   KOKKOS_INLINE_FUNCTION
-  volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
+  void operator= (const complex<InputRealType>& src) volatile {
     re_ = src.re_;
     im_ = src.im_;
-    return *this;
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
   }
 
   //! Assignment operator.
diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
similarity index 56%
rename from lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp
rename to lib/kokkos/core/src/Kokkos_Concepts.hpp
index ad3e0b35a5..82a342eec0 100644
--- a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,86 +36,43 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
-#define KOKKOS_BASIC_ALLOCATORS_HPP
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace Kokkos { namespace Impl {
-
-/// class UnmanagedAllocator
-/// does nothing when deallocate(ptr,size) is called
-class UnmanagedAllocator
-{
-public:
-  static const char * name() { return "Unmanaged Allocator"; }
-
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-};
-
-
-/// class MallocAllocator
-class MallocAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Malloc Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t size);
+#ifndef KOKKOS_CORE_CONCEPTS_HPP
+#define KOKKOS_CORE_CONCEPTS_HPP
 
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
+#include <type_traits>
 
+namespace Kokkos {
+//Schedules for Execution Policies
+struct Static {};
+struct Dynamic {};
 
-/// class AlignedAllocator
-/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
-class AlignedAllocator
+//Schedule Wrapper Type
+template<class T>
+struct Schedule
 {
-public:
-  static const char * name()
-  {
-    return "Aligned Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t size);
-
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+  static_assert(  std::is_same<T,Static>::value
+               || std::is_same<T,Dynamic>::value
+               , "Kokkos: Invalid Schedule<> type."
+               );
+  using schedule_type = Schedule<T>;
+  using type = T;
 };
 
-
-/// class PageAlignedAllocator
-/// memory aligned to PAGE_SIZE
-class PageAlignedAllocator
+//Specify Iteration Index Type
+template<typename T>
+struct IndexType
 {
-public:
-  static const char * name()
-  {
-    return "Page Aligned Allocator";
-  }
-
-  static void* allocate(size_t size);
-
-  static void deallocate(void * ptr, size_t size);
-
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+  static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
+  using index_type = IndexType<T>;
+  using type = T;
 };
 
+} // namespace Kokkos
 
-}} // namespace Kokkos::Impl
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-#endif //KOKKOS_BASIC_ALLOCATORS_HPP
-
+#endif // KOKKOS_CORE_CONCEPTS_HPP
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index e4a4643ce5..7cde4610ee 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 } // namespace Kokkos
 
 
-#if KOKKOS_USING_EXP_VIEW
-
 namespace Kokkos {
 
 using Kokkos::Experimental::kokkos_malloc ;
@@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
 
 }
 
-#else
-
-namespace Kokkos {
-
-namespace Impl {
-// should only by used by kokkos_malloc and kokkos_free
-struct MallocHelper
-{
-  static void increment_ref_count( AllocationTracker const & tracker )
-  {
-    tracker.increment_ref_count();
-  }
-
-  static void decrement_ref_count( AllocationTracker const & tracker )
-  {
-    tracker.decrement_ref_count();
-  }
-};
-} // namespace Impl
-
-/* Allocate memory from a memory space.
- * The allocation is tracked in Kokkos memory tracking system, so
- * leaked memory can be identified.
- */
-template< class Arg = DefaultExecutionSpace>
-void* kokkos_malloc(const std::string label, size_t count) {
-  if(count == 0) return NULL;
-  typedef typename Arg::memory_space MemorySpace;
-  Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);;
-  Impl::MallocHelper::increment_ref_count( tracker );
-  return tracker.alloc_ptr();
-}
-
-template< class Arg = DefaultExecutionSpace>
-void* kokkos_malloc(const size_t& count) {
-  return kokkos_malloc<Arg>("DefaultLabel",count);
-}
-
-
-/* Free memory from a memory space.
- */
-template< class Arg = DefaultExecutionSpace>
-void kokkos_free(const void* ptr) {
-  typedef typename Arg::memory_space MemorySpace;
-  typedef typename MemorySpace::allocator allocator;
-  Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
-  if (tracker.is_valid()) {
-    Impl::MallocHelper::decrement_ref_count( tracker );
-  }
-}
-
-
-template< class Arg = DefaultExecutionSpace>
-void* kokkos_realloc(const void* old_ptr, size_t size) {
-  if(old_ptr == NULL)
-    return kokkos_malloc<Arg>(size);
-
-  typedef typename Arg::memory_space MemorySpace;
-  typedef typename MemorySpace::allocator allocator;
-  Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
-
-  tracker.reallocate(size);
-
-  return tracker.alloc_ptr();
-}
-
-} // namespace Kokkos
-
-#endif
-
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index a262864157..e9648b59b8 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -69,6 +69,9 @@ namespace {
 /**\brief Token to indicate that a parameter's value is to be automatically selected */
 constexpr AUTO_t AUTO = Kokkos::AUTO_t();
 }
+
+struct InvalidType {};
+
 }
 
 //----------------------------------------------------------------------------
@@ -205,7 +208,7 @@ namespace Impl {
 template< class Functor
         , class Policy
         , class EnableFunctor = void 
-	, class EnablePolicy = void
+	      , class EnablePolicy = void
         >
 struct FunctorPolicyExecutionSpace;
 
@@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
 ///
 /// This is an implementation detail of parallel_reduce.  Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy , class ExecutionSpace = 
+template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
           typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space 
         > class ParallelReduce ;
 
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index eadcf13256..3130ee3198 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -56,11 +56,14 @@
 #include <Kokkos_CudaSpace.hpp>
 
 #include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -108,7 +111,7 @@ public:
   //! This execution space's preferred array layout.
   typedef LayoutLeft            array_layout ;
 
-  //! 
+  //!
   typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
 
   //@}
@@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Cuda/Kokkos_CudaExec.hpp>
 #include <Cuda/Kokkos_Cuda_View.hpp>
 
-#include <KokkosExp_View.hpp>
 #include <Cuda/KokkosExp_Cuda_View.hpp>
 
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
 
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
index c0223c35cf..cd728895d0 100644
--- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -54,10 +54,7 @@
 
 #include <Kokkos_HostSpace.hpp>
 
-#include <impl/Kokkos_AllocationTracker.hpp>
-
 #include <Cuda/Kokkos_Cuda_abort.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -77,33 +74,6 @@ public:
 
   /*--------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  typedef Impl::CudaMallocAllocator allocator;
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   */
-  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
-
-  /*--------------------------------*/
-  /** \brief  Cuda specific function to attached texture object to an allocation.
-   *          Output the texture object, base pointer, and offset from the input pointer.
-   */
-#if defined( __CUDACC__ )
-  static void texture_object_attach(  Impl::AllocationTracker const & tracker
-                                    , unsigned type_size
-                                    , ::cudaChannelFormatDesc const & desc
-                                   );
-#endif
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-  /*--------------------------------*/
-
   CudaSpace();
   CudaSpace( CudaSpace && rhs ) = default ;
   CudaSpace( const CudaSpace & rhs ) = default ;
@@ -137,7 +107,7 @@ namespace Impl {
 /// where the hash value is derived from the address of the
 /// object for which an atomic operation is performed.
 /// This function initializes the locks to zero (unset).
-void init_lock_array_cuda_space();
+void init_lock_arrays_cuda_space();
 
 /// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
 ///
@@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
 /// object for which an atomic operation is performed.
 /// This function retrieves the lock array pointer.
 /// If the array is not yet allocated it will do so.
-int* lock_array_cuda_space_ptr(bool deallocate = false);
+int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-Cuda::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
 }
 } // namespace Kokkos
 
@@ -172,33 +158,6 @@ public:
 
   /*--------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  typedef Impl::CudaUVMAllocator allocator;
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   */
-  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
-
-
-  /** \brief  Cuda specific function to attached texture object to an allocation.
-   *          Output the texture object, base pointer, and offset from the input pointer.
-   */
-#if defined( __CUDACC__ )
-  static void texture_object_attach(  Impl::AllocationTracker const & tracker
-                                    , unsigned type_size
-                                    , ::cudaChannelFormatDesc const & desc
-                                   );
-#endif
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-  /*--------------------------------*/
-
   CudaUVMSpace();
   CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
   CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
@@ -242,22 +201,6 @@ public:
 
   /*--------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  typedef Impl::CudaHostAllocator allocator ;
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   */
-  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-  /*--------------------------------*/
-
   CudaHostPinnedSpace();
   CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
   CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
index 8489978f54..5834fc04db 100644
--- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,167 +47,15 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_AnalyzePolicy.hpp>
+#include <Kokkos_Concepts.hpp>
 #include <iostream>
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
 
-//Schedules for Execution Policies
-struct Static {
-};
-
-struct Dynamic {
-};
-
-//Schedule Wrapper Type
-template<class ScheduleType>
-struct Schedule {
-  static_assert(std::is_same<ScheduleType,Static>::value ||
-                std::is_same<ScheduleType,Dynamic>::value,
-                "Kokkos: Invalid Schedule<> type.");
-  typedef Schedule<ScheduleType> schedule_type;
-  typedef ScheduleType type;
-};
-
-//Specif Iteration Index Type
-template<typename iType>
-struct IndexType {
-  static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
-  typedef IndexType<iType> index_type;
-  typedef iType type;
-};
-
-namespace Impl {
-
-template<class Arg>
-struct is_schedule_type {
-  enum { value = 0};
-};
-
-template<class ScheduleType>
-struct is_schedule_type<Schedule<ScheduleType> > {
-  enum {value = 1 };
-};
-
-template<class Arg>
-struct is_index_type {
-  enum { value = 0 };
-};
-
-template<typename iType>
-struct is_index_type<IndexType<iType> > {
-  enum { value = 1 };
-};
-
-template<typename Arg>
-struct is_tag_type {
-  enum { value = !(is_execution_space<Arg>::value ||
-                   is_schedule_type<Arg>::value ||
-                   is_index_type<Arg>::value ||
-                   std::is_integral<Arg>::value)};
-};
-
-//Policy Traits
-template<class ... Properties>
-struct PolicyTraits;
-
-template<>
-struct PolicyTraits<void> {
-  typedef void execution_space;
-  typedef void schedule_type;
-  typedef void index_type;
-  typedef void tag_type;
-};
-
-
-//Strip off ExecutionSpace
-template<class ExecutionSpace, class ... Props>
-struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
-  static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
-                 "ExecutionPolicy: Only one execution space template argument may be used.");
-  typedef ExecutionSpace execution_space;
-  typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
-  typedef typename PolicyTraits<void, Props ...>::index_type index_type;
-  typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
-};
-
-//Strip off ScheduleType
-template<class ScheduleType, class ... Props>
-struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
-  static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
-                 "ExecutionPolicy: Only one Schedule<..> template argument may be used.");
-  typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
-  typedef ScheduleType schedule_type;
-  typedef typename PolicyTraits<void, Props ...>::index_type index_type;
-  typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
-};
-
-//Strip off IndexType
-template<typename iType, class ... Props>
-struct PolicyTraits<void, IndexType<iType>,Props ...> {
-  static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
-                 "ExecutionPolicy: Only one IndexType<..> template argument may be used.");
-  typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
-  typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
-  typedef iType index_type;
-  typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
-};
-
-//Strip off raw IndexType
-template<typename iType, class ... Props>
-struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
-  static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
-                 "ExecutionPolicy: Only one IndexType<..> template argument may be used.");
-  typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
-  typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
-  typedef iType index_type;
-  typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
-};
-
-//Strip off TagType
-template<class TagType, class ... Props>
-struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
-                                            !is_execution_space<TagType>::value &&
-                                            !is_index_type<TagType>::value &&
-                                            !std::is_integral<TagType>::value 
-                                           >::type,
-                    TagType,Props ...> {
-  static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
-                 "ExecutionPolicy: Only one tag type template argument may be used.");
-
-  typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
-  typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
-  typedef typename PolicyTraits<void, Props ...>::index_type index_type;
-  typedef TagType tag_type;
-};
-
-
-template<class ... Props>
-struct PolicyTraits {
-#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
-  typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
-    Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
-  typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
-    Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
-  typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
-    typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
-  typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value, 
-    void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
-#else
-  typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
-  typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
-  typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
-  typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
-  typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
-#endif
-};
-
-}
-
-}
-
-namespace Kokkos {
 /** \brief  Execution policy for work over a range of an integral type.
  *
  * Valid template argument options:
@@ -230,7 +78,9 @@ namespace Kokkos {
  *  Blocking is the granularity of partitioning the range among threads.
  */
 template<class ... Properties>
-class RangePolicy: public Impl::PolicyTraits<Properties ... > {
+class RangePolicy
+  : public Impl::PolicyTraits<Properties ... >
+{
 private:
 
   typedef Impl::PolicyTraits<Properties ... > traits;
@@ -243,6 +93,7 @@ private:
 public:
 
   //! Tag this class as an execution policy
+  typedef RangePolicy execution_policy;
   typedef typename traits::index_type member_type ;
 
   KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
@@ -348,7 +199,7 @@ public:
       : m_begin(0), m_end(0)
       {
         if ( part_size ) {
-  
+
           // Split evenly among partitions, then round up to the granularity.
           const member_type work_part =
             ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
@@ -356,7 +207,7 @@ public:
 
           m_begin = range.begin() + work_part * part_rank ;
           m_end   = m_begin       + work_part ;
-  
+
           if ( range.end() < m_begin ) m_begin = range.end() ;
           if ( range.end() < m_end )   m_end   = range.end() ;
         }
@@ -366,10 +217,11 @@ public:
      member_type m_end ;
      WorkRange();
      WorkRange & operator = ( const WorkRange & );
-   
+
   };
 };
 
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -377,38 +229,6 @@ public:
 
 namespace Kokkos {
 
-namespace Experimental {
-
-/** \brief Scratch memory request accepting per team and per thread value
- *
- * An instance of this class can be given as the last argument to a 
- * TeamPolicy constructor. It sets the amount of user requested shared
- * memory for the team.
- */
-
-template< class MemorySpace >
-class TeamScratchRequest {
-  size_t m_per_team;
-  size_t m_per_thread;
-  
-public:
-  TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
-   m_per_team(per_team_), m_per_thread(per_thread_) {
-  } 
-
-  size_t per_team() const {
-    return m_per_team;
-  }
-  size_t per_thread() const {
-    return m_per_thread;
-  }
-  size_t total(const size_t team_size) const {
-    return m_per_team + m_per_thread * team_size;
-  }
-}; 
-
-}
-
 namespace Impl {
 
 
@@ -451,11 +271,9 @@ public:
 
   TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
 
-  template<class MemorySpace>
-  TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
+/*  TeamPolicyInternal( int league_size_request , int team_size_request );
 
-  template<class MemorySpace>
-  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
+  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
 
   /** \brief  The actual league size (number of teams) of the policy.
    *
@@ -574,12 +392,14 @@ class TeamPolicy: public
   typedef Impl::TeamPolicyInternal<
        typename Impl::PolicyTraits<Properties ... >::execution_space,
        Properties ...> internal_policy;
+
   typedef Impl::PolicyTraits<Properties ... > traits;
 
 public:
+  typedef TeamPolicy execution_policy;
 
   TeamPolicy& operator = (const TeamPolicy&) = default;
- 
+
   /** \brief  Construct policy with the given instance of the execution space */
   TeamPolicy( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
     : internal_policy(typename traits::execution_space(),league_size_request,team_size_request, vector_length_request) {}
@@ -594,13 +414,11 @@ public:
   TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
     : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
 
-  template<class MemorySpace>
-  TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-    : internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
+/*  TeamPolicy( int league_size_request , int team_size_request  )
+    : internal_policy(league_size_request,team_size_request) {}
 
-  template<class MemorySpace>
-  TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-    : internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
+  TeamPolicy( int league_size_request , const Kokkos::AUTO_t &  )
+    : internal_policy(league_size_request,Kokkos::AUTO()) {}*/
 
 private:
   TeamPolicy(const internal_policy& p):internal_policy(p) {}
@@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
 
 } // namespace Kokkos
 
+
 #endif /* #define KOKKOS_EXECPOLICY_HPP */
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index 6bef213b01..e02689b0f9 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -120,21 +120,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device<execution_space,memory_space> device_type;
 
-  /*--------------------------------*/
-#if ! KOKKOS_USING_EXP_VIEW
-
-  typedef Impl::HBWMallocAllocator allocator ;
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   */
-  static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
   /*--------------------------------*/
   /* Functions unique to the HBWSpace */
   static int in_parallel();
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index bea955cdd9..5fe686559a 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -55,9 +55,6 @@
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Error.hpp>
 
-#include <impl/Kokkos_AllocationTracker.hpp>
-#include <impl/Kokkos_BasicAllocators.hpp>
-
 #include <impl/KokkosExp_SharedAlloc.hpp>
 
 /*--------------------------------------------------------------------------*/
@@ -128,25 +125,6 @@ public:
   //! This memory space preferred device_type
   typedef Kokkos::Device<execution_space,memory_space> device_type;
 
-  /*--------------------------------*/
-#if ! KOKKOS_USING_EXP_VIEW
-
-#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
-  typedef Impl::PageAlignedAllocator allocator ;
-#else
-  typedef Impl::AlignedAllocator allocator ;
-#endif
-
-  /** \brief  Allocate a contiguous block of memory.
-   *
-   *  The input label is associated with the block of memory.
-   *  The block of memory is tracked via reference counting where
-   *  allocation gives it a reference count of one.
-   */
-  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
   /*--------------------------------*/
   /* Functions unique to the HostSpace */
   static int in_parallel();
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 40a46b3022..7d1e59af5e 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -133,11 +133,23 @@
 // still identifies as 7.0
 #error "Cuda version 7.5 or greater required for host-to-device Lambda support"
 #endif
+#if ( CUDA_VERSION < 8000 )
 #define KOKKOS_LAMBDA [=]__device__
+#else
+#define KOKKOS_LAMBDA [=]__host__ __device__
+#endif
 #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
 #endif
 #endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
 
+
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+   // Cuda version 8.0 still needs the functor wrapper
+   #if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
+      #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
+   #endif
+#endif
+
 /*--------------------------------------------------------------------------*/
 /* Language info: C++, CUDA, OPENMP */
 
@@ -440,27 +452,16 @@
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-/* Transitional macro to change between old and new View,
- * default to use new View.
+/* Transitional macros to change between old and new View
+ * are no longer supported.
  */
 
-#if ! defined( KOKKOS_USING_EXP_VIEW )
 #if defined( KOKKOS_USING_DEPRECATED_VIEW )
-#define KOKKOS_USING_EXP_VIEW 0
-#else
-#define KOKKOS_USING_EXP_VIEW 1
-#endif
+#error "Kokkos deprecated View has been removed"
 #endif
 
-#if KOKKOS_USING_EXP_VIEW
-#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+#define KOKKOS_USING_EXP_VIEW 1
 #define KOKKOS_USING_EXPERIMENTAL_VIEW
-#endif
-#else /* ! KOKKOS_USING_EXP_VIEW */
-#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
-#endif
-#endif
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index 72d2a30560..d843f7c9a1 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -44,13 +44,16 @@
 #ifndef KOKKOS_MEMORYPOOL_HPP
 #define KOKKOS_MEMORYPOOL_HPP
 
-#include <vector>
-
 #include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/KokkosExp_SharedAlloc.hpp>
-#include <Kokkos_ExecPolicy.hpp>
-#include <Kokkos_Atomic.hpp>
+
+#include <limits>
+#include <algorithm>
+#include <chrono>
 
 // How should errors be handled?  In general, production code should return a
 // value indicating failure so the user can decide how the error is handled.
@@ -60,516 +63,1431 @@
 //#define KOKKOS_MEMPOOL_PRINTERR
 
 //#define KOKKOS_MEMPOOL_PRINT_INFO
+//#define KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+//#define KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+//#define KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+//#define KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+//#define KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+//#define KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+
+// A superblock is considered full when this percentage of its pages are full.
+#define KOKKOS_MEMPOOL_SB_FULL_FRACTION 0.80
+
+// A page is considered full when this percentage of its blocks are full.
+#define KOKKOS_MEMPOOL_PAGE_FULL_FRACTION 0.875  // 28 / 32
 
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
 namespace Experimental {
 
-template < typename Space , typename ExecSpace = typename Space::execution_space >
-class MemoryPool;
+namespace MempoolImpl {
 
-namespace Impl {
+template < typename T, typename ExecutionSpace >
+struct initialize_array {
+  typedef ExecutionSpace                      execution_space;
+  typedef typename ExecutionSpace::size_type  size_type;
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-template < typename MemPool >
-struct print_mempool {
-  size_t      m_num_chunk_sizes;
-  size_t *    m_chunk_size;
-  uint64_t *  m_freelist;
-  char *      m_data;
-
-  print_mempool( size_t ncs, size_t * cs, uint64_t * f, char * d )
-    : m_num_chunk_sizes(ncs), m_chunk_size(cs), m_freelist(f), m_data(d)
-  {}
+  T *  m_data;
+  T    m_value;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator()( size_t i ) const
+  initialize_array( T * d, size_t size, T v ) : m_data( d ), m_value( v )
   {
-    if ( i == 0 ) {
-      printf( "*** ON DEVICE ***\n");
-      printf( "m_chunk_size: 0x%llx\n", reinterpret_cast<uint64_t>( m_chunk_size ) );
-      printf( "  m_freelist: 0x%llx\n", reinterpret_cast<uint64_t>( m_freelist ) );
-      printf( "      m_data: 0x%llx\n", reinterpret_cast<uint64_t>( m_data ) );
-      for ( size_t l = 0; l < m_num_chunk_sizes; ++l ) {
-        printf( "%2lu    freelist: %10llu    chunk_size: %6lu\n",
-               l, get_head_offset( m_freelist[l] ), m_chunk_size[l] );
-      }
-      printf( "                              chunk_size: %6lu\n\n",
-              m_chunk_size[m_num_chunk_sizes] );
-    }
+    Kokkos::parallel_for( size, *this );
+
+    execution_space::fence();
   }
 
-  // This is only redefined here to avoid having to pass a MemPoolList object
-  // to the class.
   KOKKOS_INLINE_FUNCTION
-  uint64_t get_head_offset(uint64_t head) const
-  { return ( head >> MemPool::TAGBITS ) << MemPool::LG_MIN_CHUNKSIZE; }
+  void operator()( size_type i ) const { m_data[i] = m_value; }
 };
-#endif
 
-template < typename MemPool >
-struct initialize_mempool {
-  char *  m_data;
-  size_t  m_chunk_size;
-  size_t  m_last_chunk;
-  size_t  m_base_offset;
+template <typename Bitset>
+struct bitset_count
+{
+  typedef typename Bitset::execution_space     execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef typename Bitset::size_type           value_type;
+  typedef typename Bitset::word_type           word_type;
+
+  word_type *   m_words;
+  value_type &  m_result;
+
+  bitset_count( word_type * w, value_type num_words, value_type & r )
+    : m_words( w ), m_result( r )
+  {
+    parallel_reduce( num_words, *this, m_result );
+  }
 
-  initialize_mempool( char * d, size_t cs, size_t lc, size_t bo )
-    : m_data(d), m_chunk_size(cs), m_last_chunk(lc), m_base_offset(bo)
-  {}
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  { v = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t i ) const
+  void join( volatile value_type & dst, volatile value_type const & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & count) const
   {
-    uint64_t * lp =
-      reinterpret_cast<uint64_t *>( m_data + m_base_offset + i * m_chunk_size );
-
-    // All entries in the list point to the next entry except the last which
-    // uses a reserved value to indicate the end of the list.  The offset from
-    // the base pointer is stored in increments of the minimum chunk size.
-    *lp = i < m_last_chunk ?
-          m_base_offset + (i + 1) * m_chunk_size :
-          MemPool::FREELIST_END;
+    count += Kokkos::Impl::bit_count( m_words[i] );
   }
 };
 
-class MemPoolList {
-private:
-
-  typedef Impl::SharedAllocationTracker  Tracker;
+template < typename Device >
+class Bitset {
+public:
+  typedef typename Device::execution_space  execution_space;
+  typedef typename Device::memory_space     memory_space;
+  typedef unsigned                          word_type;
+  typedef unsigned                          size_type;
 
-  template < typename , typename > friend class Kokkos::Experimental::MemoryPool;
-  template < typename > friend struct initialize_mempool;
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-  template < typename > friend struct print_mempool;
-#endif
+  typedef Kokkos::Impl::DeepCopy< memory_space, Kokkos::HostSpace > raw_deep_copy;
 
   // Define some constants.
   enum {
-    // The head of a freelist is a 64 bit unsigned interger.  We divide it
-    // into 2 pieces.  The upper (64-TAGBITS) bits is the offset from the base
-    // data pointer of the allocator in increments of the minimum chunk size.
-    // The lower TAGBITS bits is the tag used to prevent ABA problems.  The
-    // largest two values that fit in the offset portion are reserved to
-    // represent the end of the freelist and that the freelist is locked.
-    //
-    // Using 32 bits for both the tag and offset and with a minimum chunk size
-    // of 128 bytes, the offset can address 549755813632 bytes (app. 512 GB)
-    // of memory.  This should be more than enough to address the whole address
-    // space of a GPU or MIC for the foreseeable future.
-    TAGBITS            = 32,
-    MIN_CHUNKSIZE      = 128,
-
-    TAGBITS_MASK       = ( uint64_t( 1 ) << TAGBITS ) - 1,
-    LG_MIN_CHUNKSIZE   = Kokkos::Impl::integral_power_of_two(MIN_CHUNKSIZE),
-
-    // The largest two values of the offset are reserved to indicate the end of a
-    // freelist (2^TAGBITS - 2) and that the freelist is locked (2^TAGBITS - 1).
-    // They are shifted so they can be compared directly to the result of
-    // get_head_offset().
-    FREELIST_END       = uint64_t( TAGBITS_MASK - 1 ) << LG_MIN_CHUNKSIZE,
-    FREELIST_LOCK      = uint64_t( TAGBITS_MASK ) << LG_MIN_CHUNKSIZE,
-
-    // This is the head value for a locked freelist.  It uses the lock value for
-    // the offset and 0 for the tagbits.
-    FREELIST_LOCK_HEAD = uint64_t( TAGBITS_MASK ) << TAGBITS
+    // Size of bitset word.  Should be 32.
+    WORD_SIZE    = sizeof(word_type) * CHAR_BIT,
+    LG_WORD_SIZE = Kokkos::Impl::integral_power_of_two( WORD_SIZE ),
+    WORD_MASK    = WORD_SIZE - 1
   };
 
-  Tracker   m_track;
+private:
+  word_type *  m_words;
+  size_type    m_size;
+  size_type    m_num_words;
+  word_type    m_last_word_mask;
 
-  // These three variables are pointers into device memory.
-  size_t *    m_chunk_size; // Array of chunk sizes of freelists.
-  uint64_t *  m_freelist;   // Array of freelist heads.
-  char *      m_data;       // Beginning memory location used for chunks.
+public:
+  ~Bitset() = default;
+  Bitset() = default;
+  Bitset( Bitset && ) = default;
+  Bitset( const Bitset & ) = default;
+  Bitset & operator = ( Bitset && ) = default;
+  Bitset & operator = ( const Bitset & ) = default;
+
+  void init( void * w, size_type s )
+  {
+    // Assumption: The size of the memory pointed to by w is a multiple of
+    //             sizeof(word_type).
 
-  size_t      m_data_size;
-  size_t      m_chunk_spacing;
+    m_words = reinterpret_cast<word_type*>( w );
+    m_size = s;
+    m_num_words = ( s + WORD_SIZE - 1 ) >> LG_WORD_SIZE;
+    m_last_word_mask = m_size & WORD_MASK ? ( word_type(1) << ( m_size & WORD_MASK ) ) - 1 : 0;
 
-#if defined(KOKKOS_MEMPOOL_PRINT_INFO) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
-  static long m_count;
-#endif
+    reset();
+  }
 
-  ~MemPoolList() = default;
-  MemPoolList() = default;
-  MemPoolList( MemPoolList && ) = default;
-  MemPoolList( const MemPoolList & ) = default;
-  MemPoolList & operator = ( MemPoolList && ) = default;
-  MemPoolList & operator = ( const MemPoolList & ) = default;
+  size_type size() const { return m_size; }
 
-  template < typename MemorySpace, typename ExecutionSpace >
-  inline
-  MemPoolList( const MemorySpace & memspace, const ExecutionSpace &,
-               size_t arg_base_chunk_size, size_t arg_total_size,
-               size_t num_chunk_sizes, size_t chunk_spacing )
-    : m_track(), m_chunk_size(0), m_freelist(0), m_data(0), m_data_size(0),
-      m_chunk_spacing(chunk_spacing)
+  size_type count() const
   {
-    static_assert( sizeof(size_t) <= sizeof(void*), "" );
+    size_type val;
+    bitset_count< Bitset > bc( m_words, m_num_words, val );
+    return val;
+  }
 
-    typedef Impl::SharedAllocationRecord< MemorySpace, void >  SharedRecord;
-    typedef Kokkos::RangePolicy< ExecutionSpace >              Range;
+  void set()
+  {
+    // Set all the bits.
+    initialize_array< word_type, execution_space > ia( m_words, m_num_words, ~word_type(0) );
 
-    size_t base_chunk_size = arg_base_chunk_size;
+    if ( m_last_word_mask ) {
+      // Clear the unused bits in the last block.
+      raw_deep_copy( m_words + ( m_num_words - 1 ), &m_last_word_mask, sizeof(word_type) );
+    }
+  }
 
-    // The base chunk size must be at least MIN_CHUNKSIZE bytes as this is the
-    // cache-line size for NVIDA GPUs.
-    if ( base_chunk_size < MIN_CHUNKSIZE ) {
+  void reset()
+  {
+    initialize_array< word_type, execution_space > ia( m_words, m_num_words, word_type(0) );
+  }
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-      printf( "** Chunk size must be at least %u bytes.  Setting to %u. **\n",
-              MIN_CHUNKSIZE, MIN_CHUNKSIZE);
-      fflush( stdout );
-#endif
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return word & mask;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return !( atomic_fetch_or( &m_words[ word_pos ], mask ) & mask );
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool reset( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    return atomic_fetch_and( &m_words[ word_pos ], ~mask ) & mask;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, word_type >
+  fetch_word_reset( size_type i ) const
+  {
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type mask = word_type(1) << ( i & WORD_MASK );
+
+    Kokkos::pair<bool, word_type> result;
+    result.second = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+    result.first = result.second & mask;
+
+    return result;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  set_any_in_word( size_type i, word_type & prev_val ) const
+  {
+    prev_val = 0;
 
-      base_chunk_size = MIN_CHUNKSIZE;
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+
+    // Loop until there are no more unset bits in the word.
+    while ( ~word ) {
+      // Find the first unset bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( ~word );
+
+      // Try to set the bit.
+      word_type mask = word_type(1) << bit;
+      word = atomic_fetch_or( &m_words[ word_pos ], mask );
+
+      if ( !( word & mask ) ) {
+        // Successfully set the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
     }
 
-    // The base chunk size must also be a multiple of MIN_CHUNKSIZE bytes for
-    // correct memory alignment of the chunks.  If it isn't a multiple of
-    // MIN_CHUNKSIZE, set it to the smallest multiple of MIN_CHUNKSIZE
-    // greater than the given chunk size.
-    if ( base_chunk_size % MIN_CHUNKSIZE != 0 ) {
-      size_t old_chunk_size = base_chunk_size;
-      base_chunk_size = ( ( old_chunk_size + MIN_CHUNKSIZE - 1 ) / MIN_CHUNKSIZE ) *
-                        MIN_CHUNKSIZE;
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-      printf( "** Chunk size must be a multiple of %u bytes.  Given: %lu  Using: %lu. **\n",
-              MIN_CHUNKSIZE, old_chunk_size, base_chunk_size);
-      fflush( stdout );
-#endif
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  set_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const
+  {
+    prev_val = 0;
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word = ( ~word ) & word_mask;
 
+    // Loop until there are no more unset bits in the word.
+    while ( word ) {
+      // Find the first unset bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
+
+      // Try to set the bit.
+      word_type mask = word_type(1) << bit;
+      word = atomic_fetch_or( &m_words[ word_pos ], mask );
+
+      if ( !( word & mask ) ) {
+        // Successfully set the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+
+      word = ( ~word ) & word_mask;
     }
 
-    // Force total_size to be a multiple of base_chunk_size.
-    // Preserve the number of chunks originally requested.
-    size_t total_size = base_chunk_size *
-      ( ( arg_total_size + arg_base_chunk_size - 1 ) / arg_base_chunk_size );
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  reset_any_in_word( size_type i, word_type & prev_val ) const
+  {
+    prev_val = 0;
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
 
-    m_data_size = total_size;
+    // Loop until there are no more set bits in the word.
+    while ( word ) {
+      // Find the first set bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
 
-    // Get the chunk size for the largest possible chunk.
-    //   max_chunk_size =
-    //     base_chunk_size * (m_chunk_spacing ^ (num_chunk_sizes - 1))
-    size_t max_chunk_size = base_chunk_size;
-    for (size_t i = 1; i < num_chunk_sizes; ++i) {
-      max_chunk_size *= m_chunk_spacing;
+      // Try to reset the bit.
+      word_type mask = word_type(1) << bit;
+      word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+
+      if ( word & mask ) {
+        // Successfully reset the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
     }
 
-    // We want each chunk size to use total_size / num_chunk_sizes memory.  If
-    // the total size of the pool is not enough to accomodate this, keep making
-    // the next lower chunk size the max_chunk_size until it is.
-    while ( max_chunk_size > total_size / num_chunk_sizes ) {
-      max_chunk_size /= m_chunk_spacing;
-      --num_chunk_sizes;
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair< bool, size_type >
+  reset_any_in_word( size_type i, word_type & prev_val, word_type word_mask ) const
+  {
+    prev_val = 0;
+
+    size_type word_pos = i >> LG_WORD_SIZE;
+    word_type word = volatile_load( &m_words[ word_pos ] );
+    word = word & word_mask;
+
+    // Loop until there are no more set bits in the word.
+    while ( word ) {
+      // Find the first set bit in the word.
+      size_type bit = Kokkos::Impl::bit_scan_forward( word );
+
+      // Try to reset the bit.
+      word_type mask = word_type(1) << bit;
+      word = atomic_fetch_and( &m_words[ word_pos ], ~mask );
+
+      if ( word & mask ) {
+        // Successfully reset the bit.
+        prev_val = word;
+
+        return Kokkos::pair<bool, size_type>( true, ( word_pos << LG_WORD_SIZE ) + bit );
+      }
+
+      word = word & word_mask;
     }
 
-    // We put a header at the beginnig of the device memory and use extra
-    // chunks to store the header.  The header contains:
-    //   size_t     chunk_size[num_chunk_sizes+1]
-    //   uint64_t  freelist[num_chunk_sizes]
+    // Didn't find a free bit in this word.
+    return Kokkos::pair<bool, size_type>( false, i );
+  }
+};
 
-    // Calculate the size of the header where the size is rounded up to the
-    // smallest multiple of base_chunk_size >= the needed size.  The size of the
-    // chunk size array is calculated using sizeof(void*) to guarantee alignment
-    // for the freelist array.  This assumes sizeof(size_t) <= sizeof(void*).
-    size_t header_bytes = ( 2 * num_chunk_sizes + 1 ) * sizeof(void*);
-    size_t header_size =
-      ( header_bytes + base_chunk_size - 1 ) / base_chunk_size * base_chunk_size;
+template < typename UInt32View, typename BSHeaderView, typename SBHeaderView,
+           typename MempoolBitset >
+struct create_histogram {
+  typedef typename UInt32View::execution_space  execution_space;
+  typedef typename execution_space::size_type   size_type;
+  typedef Kokkos::pair< double, uint32_t >      value_type;
+
+  size_t         m_start;
+  UInt32View     m_page_histogram;
+  BSHeaderView   m_blocksize_info;
+  SBHeaderView   m_sb_header;
+  MempoolBitset  m_sb_blocks;
+  size_t         m_lg_max_sb_blocks;
+  uint32_t       m_lg_min_block_size;
+  uint32_t       m_blocks_per_page;
+  value_type &   m_result;
+
+  create_histogram( size_t start, size_t end, UInt32View ph, BSHeaderView bsi,
+                    SBHeaderView sbh, MempoolBitset sbb, size_t lmsb,
+                    uint32_t lmbs, uint32_t bpp, value_type & r )
+    : m_start( start ), m_page_histogram( ph ), m_blocksize_info( bsi ),
+      m_sb_header( sbh ), m_sb_blocks( sbb ), m_lg_max_sb_blocks( lmsb ),
+      m_lg_min_block_size( lmbs ), m_blocks_per_page( bpp ), m_result( r )
+  {
+    Kokkos::parallel_reduce( end - start, *this, m_result );
 
-    // Allocate the memory including the header.
-    size_t alloc_size = total_size + header_size;
+    execution_space::fence();
+  }
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-      printf( "** Allocating total %ld bytes\n", long(alloc_size));
-      fflush( stdout );
-#endif
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  {
+    v.first  = 0.0;
+    v.second = 0;
+  }
 
-    SharedRecord * rec =
-      SharedRecord::allocate( memspace, "mempool", alloc_size );
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, volatile value_type const & src ) const
+  {
+    dst.first += src.first;
+    dst.second += src.second;
+  }
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-      printf( "** Allocated total %ld bytes at 0x%lx\n",
-              long(alloc_size), long(rec->data()) );
-      fflush( stdout );
-#endif
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & r ) const
+  {
+    size_type i2 = i + m_start;
 
-    m_track.assign_allocated_record_to_uninitialized( rec );
+    uint32_t lg_block_size = m_sb_header(i2).m_lg_block_size;
 
-    {
-      // Get the pointers into the allocated memory.
-      char * mem = reinterpret_cast<char *>( rec->data() );
-      m_chunk_size = reinterpret_cast<size_t *>( mem );
-      m_freelist = reinterpret_cast<uint64_t *>(
-                   mem + ( num_chunk_sizes + 1 ) * sizeof(void*) );
-      m_data = mem + header_size;
+    // A superblock only has a block size of 0 when it is empty.
+    if ( lg_block_size != 0 ) {
+      uint32_t block_size_id = lg_block_size - m_lg_min_block_size;
+      uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
+      uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-      printf( "** Partitioning allocation 0x%lx : m_chunk_size[0x%lx] m_freelist[0x%lx] m_data[0x%lx]\n",
-              (unsigned long) mem, (unsigned long) m_chunk_size,
-              (unsigned long) m_freelist, (unsigned long) m_data );
-      fflush( stdout );
-#endif
+      uint32_t total_allocated_blocks = 0;
+
+      for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
+        unsigned start_pos = ( i2 << m_lg_max_sb_blocks ) + j * m_blocks_per_page;
+        unsigned end_pos = start_pos + m_blocks_per_page;
+        uint32_t page_allocated_blocks = 0;
+
+        for ( unsigned k = start_pos; k < end_pos; ++k ) {
+          page_allocated_blocks += m_sb_blocks.test( k );
+        }
+
+        total_allocated_blocks += page_allocated_blocks;
+
+        atomic_fetch_add( &m_page_histogram(page_allocated_blocks), 1 );
+      }
+
+      r.first += double(total_allocated_blocks) / blocks_per_sb;
+      r.second += blocks_per_sb;
     }
+  }
+};
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+template < typename UInt32View, typename SBHeaderView, typename MempoolBitset >
+struct count_allocated_blocks {
+  typedef typename UInt32View::execution_space  execution_space;
+  typedef typename execution_space::size_type   size_type;
+
+  UInt32View     m_num_allocated_blocks;
+  SBHeaderView   m_sb_header;
+  MempoolBitset  m_sb_blocks;
+  size_t         m_sb_size;
+  size_t         m_lg_max_sb_blocks;
+
+  count_allocated_blocks( size_t num_sb, UInt32View nab, SBHeaderView sbh,
+                          MempoolBitset sbb, size_t sbs, size_t lmsb )
+    : m_num_allocated_blocks( nab ), m_sb_header( sbh ),
+      m_sb_blocks( sbb ), m_sb_size( sbs ), m_lg_max_sb_blocks( lmsb )
+  {
+    Kokkos::parallel_for( num_sb, *this );
+
+    execution_space::fence();
+  }
 
-    // Initialize the chunk sizes array.  Create num_chunk_sizes different
-    // chunk sizes where each successive chunk size is
-    // m_chunk_spacing * previous chunk size.  The last entry in the array is
-    // 0 and is used for a stopping condition.
-    m_chunk_size[0] = base_chunk_size;
-    for ( size_t i = 1; i < num_chunk_sizes; ++i ) {
-      m_chunk_size[i] = m_chunk_size[i - 1] * m_chunk_spacing;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
+
+    // A superblock only has a block size of 0 when it is empty.
+    if ( lg_block_size != 0 ) {
+      // Count the allocated blocks in the superblock.
+      uint32_t blocks_per_sb = lg_block_size > 0 ? m_sb_size >> lg_block_size : 0;
+      unsigned start_pos = i << m_lg_max_sb_blocks;
+      unsigned end_pos = start_pos + blocks_per_sb;
+      uint32_t count = 0;
+
+      for ( unsigned j = start_pos; j < end_pos; ++j ) {
+        count += m_sb_blocks.test( j );
+      }
+
+      m_num_allocated_blocks(i) = count;
     }
-    m_chunk_size[num_chunk_sizes] = 0;
+  }
+};
+#endif
 
-    std::vector<size_t> num_chunks(num_chunk_sizes);
+}
 
-    // Set the starting point in memory and get the number of chunks for each
-    // freelist.  Start with the largest chunk size to ensure usage of all the
-    // memory.  If there is leftover memory for a chunk size, it will be used
-    // by a smaller chunk size.
-    size_t used_memory = 0;
-    for ( size_t i = num_chunk_sizes; i > 0; --i ) {
-      // Set the starting position in the memory for the current chunk sizes's
-      // freelist and initialize the tag to 0.
-      m_freelist[i - 1] = create_head( used_memory, 0UL );
+/// \class MemoryPool
+/// \brief Bitset based memory manager for pools of same-sized chunks of memory.
+/// \tparam Device Kokkos device that gives the execution and memory space the
+///                allocator will be used in.
+///
+/// MemoryPool is a memory space that can be on host or device.  It provides a
+/// pool memory allocator for fast allocation of same-sized chunks of memory.
+/// The memory is only accessible on the host / device this allocator is
+/// associated with.
+///
+/// This allocator is based on ideas from the following GPU allocators:
+///   Halloc (https://github.com/canonizer/halloc).
+///   ScatterAlloc (https://github.com/ComputationalRadiationPhysics/scatteralloc)
+template < typename Device >
+class MemoryPool {
+private:
+  // The allocator uses superblocks.  A superblock is divided into pages, and a
+  // page is divided into blocks.  A block is the chunk of memory that is given
+  // out by the allocator.  A page always has a number of blocks equal to the
+  // size of the word used by the bitset.  Thus, the pagesize can vary between
+  // superblocks as it is based on the block size of the superblock.  The
+  // allocator supports all powers of 2 from MIN_BLOCK_SIZE to the size of a
+  // superblock as block sizes.
+
+  // Superblocks are divided into 4 categories:
+  //   1. empty    - is completely empty; there are no active allocations
+  //   2. partfull - partially full; there are some active allocations
+  //   3. full     - full enough with active allocations that new allocations
+  //                 will likely fail
+  //   4. active   - is currently the active superblock for a block size
+  //
+  // An inactive superblock is one that is empty, partfull, or full.
+  //
+  // New allocations occur only from an active superblock.  If a superblock is
+  // made inactive after an allocation request is made to it but before the
+  // allocation request is fulfilled, the allocation will still be attempted
+  // from that superblock.  Deallocations can  occur to partfull, full, or
+  // active superblocks.  Superblocks move between categories as allocations
+  // and deallocations happen.  Superblocks all start empty.
+  //
+  // Here are the possible moves between categories:
+  //   empty    -> active    During allocation, there is no active superblock
+  //                         or the active superblock is full.
+  //   active   -> full      During allocation, the full threshold of the
+  //                         superblock is reached when increasing the fill
+  //                         level.
+  //   full     -> partfull  During deallocation, the full threshold of the
+  //                         superblock is crossed when decreasing the fill
+  //                         level.
+  //   partfull -> empty     Deallocation of the last allocated block of an
+  //                         inactive superblock.
+  //   partfull -> active    During allocation, the active superblock is full.
+  //
+  // When a new active superblock is needed, partfull superblocks of the same
+  // block size are chosen over empty superblocks.
+  //
+  // The empty and partfull superblocks are tracked using bitsets that represent
+  // the superblocks in those respective categories.  Empty superblocks use a
+  // single bitset, while partfull superblocks use a bitset per block size
+  // (contained sequentially in a single bitset).  Active superblocks are
+  // tracked by the active superblocks array.  Full superblocks aren't tracked
+  // at all.
+
+  typedef typename Device::execution_space    execution_space;
+  typedef typename Device::memory_space       backend_memory_space;
+  typedef Device                              device_type;
+  typedef MempoolImpl::Bitset< device_type >  MempoolBitset;
 
-      size_t mem_avail =
-        total_size - (i - 1) * ( total_size / num_chunk_sizes ) - used_memory;
+  // Define some constants.
+  enum {
+    MIN_BLOCK_SIZE     = 64,
+    LG_MIN_BLOCK_SIZE  = Kokkos::Impl::integral_power_of_two( MIN_BLOCK_SIZE ),
+    MAX_BLOCK_SIZES    = 31 - LG_MIN_BLOCK_SIZE + 1,
 
-      // Set the number of chunks for the current chunk sizes's freelist.
-      num_chunks[i - 1] = mem_avail / m_chunk_size[i - 1];
+    // Size of bitset word.
+    BLOCKS_PER_PAGE    = MempoolBitset::WORD_SIZE,
+    LG_BLOCKS_PER_PAGE = MempoolBitset::LG_WORD_SIZE,
 
-      used_memory += num_chunks[i - 1] * m_chunk_size[i - 1];
-    }
+    INVALID_SUPERBLOCK = ~uint32_t(0),
+    SUPERBLOCK_LOCK    = ~uint32_t(0) - 1,
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-    printf( "\n" );
-    printf( "*** ON HOST ***\n");
-    printf( "m_chunk_size: 0x%llx\n", reinterpret_cast<uint64_t>( m_chunk_size ) );
-    printf( "  m_freelist: 0x%llx\n", reinterpret_cast<uint64_t>( m_freelist ) );
-    printf( "      m_data: 0x%llx\n", reinterpret_cast<uint64_t>( m_data ) );
-    for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
-      printf( "%2lu    freelist: %10llu    chunk_size: %6lu    num_chunks: %8lu\n",
-              i, get_head_offset( m_freelist[i] ), m_chunk_size[i], num_chunks[i] );
-    }
-    printf( "                              chunk_size: %6lu\n\n",
-            m_chunk_size[num_chunk_sizes] );
-    fflush( stdout );
+    MAX_TRIES          = 32             // Cap on the number of pages searched
+                                        // before an allocation returns empty.
+  };
+
+public:
+  // Stores information about each superblock.
+  struct SuperblockHeader {
+    uint32_t  m_full_pages;
+    uint32_t  m_empty_pages;
+    uint32_t  m_lg_block_size;
+    uint32_t  m_is_active;
+
+    KOKKOS_FUNCTION
+    SuperblockHeader() :
+      m_full_pages(0), m_empty_pages(0), m_lg_block_size(0), m_is_active(false) {}
+  };
+
+  // Stores information about each block size.
+  struct BlockSizeHeader {
+    uint32_t  m_blocks_per_sb;
+    uint32_t  m_pages_per_sb;
+    uint32_t  m_sb_full_level;
+    uint32_t  m_page_full_level;
+
+    KOKKOS_FUNCTION
+    BlockSizeHeader() :
+      m_blocks_per_sb(0), m_pages_per_sb(0), m_sb_full_level(0), m_page_full_level(0) {}
+  };
+
+private:
+  typedef Impl::SharedAllocationTracker            Tracker;
+  typedef View< uint32_t *, device_type >          UInt32View;
+  typedef View< SuperblockHeader *, device_type >  SBHeaderView;
+
+  // The letters 'sb' used in any variable name mean superblock.
+
+  size_t           m_lg_sb_size;        // Log2 of superblock size.
+  size_t           m_sb_size;           // Superblock size.
+  size_t           m_lg_max_sb_blocks;  // Log2 of the number of blocks of the
+                                        // minimum block size in a superblock.
+  size_t           m_num_sb;            // Number of superblocks.
+  size_t           m_ceil_num_sb;       // Number of superblocks rounded up to the smallest
+                                        // multiple of the bitset word size.  Used by
+                                        // bitsets representing superblock categories to
+                                        // ensure different block sizes never share a word
+                                        // in the bitset.
+  size_t           m_num_block_size;    // Number of block sizes supported.
+  size_t           m_data_size;         // Amount of memory available to the allocator.
+  size_t           m_sb_blocks_size;    // Amount of memory for free / empty blocks bitset.
+  size_t           m_empty_sb_size;     // Amount of memory for empty superblocks bitset.
+  size_t           m_partfull_sb_size;  // Amount of memory for partfull superblocks bitset.
+  size_t           m_total_size;        // Total amount of memory allocated.
+  char *           m_data;              // Beginning device memory location used for
+                                        // superblocks.
+  UInt32View       m_active;            // Active superblocks IDs.
+  SBHeaderView     m_sb_header;         // Header info for superblocks.
+  MempoolBitset    m_sb_blocks;         // Bitsets representing free / allocated status
+                                        // of blocks in superblocks.
+  MempoolBitset    m_empty_sb;          // Bitset representing empty superblocks.
+  MempoolBitset    m_partfull_sb;       // Bitsets representing partially full superblocks.
+  Tracker          m_track;             // Tracker for superblock memory.
+  BlockSizeHeader  m_blocksize_info[MAX_BLOCK_SIZES];  // Header info for block sizes.
+
+  // There were several methods tried for storing the block size header info: in a View,
+  // in a View of const data, and in a RandomAccess View.  All of these were slower than
+  // storing it in a static array that is a member variable to the class.  In the latter
+  // case, the block size info gets copied into the constant memory on the GPU along with
+  // the class when it is copied there for executing a parallel loop.  Instead of storing
+  // the values, computing the values every time they were needed was also tried.  This
+  // method was slightly slower than storing them in the static array.
+
+public:
+  //! Tag this class as a kokkos memory space
+  typedef MemoryPool  memory_space;
+
+  ~MemoryPool() = default;
+  MemoryPool() = default;
+  MemoryPool( MemoryPool && ) = default;
+  MemoryPool( const MemoryPool & ) = default;
+  MemoryPool & operator = ( MemoryPool && ) = default;
+  MemoryPool & operator = ( const MemoryPool & ) = default;
+
+  /// \brief Initializes the memory pool.
+  /// \param memspace The memory space from which the memory pool will allocate memory.
+  /// \param total_size The requested memory amount controlled by the allocator.  The
+  ///                   actual amount is rounded up to the smallest multiple of the
+  ///                   superblock size >= the requested size.
+  /// \param log2_superblock_size Log2 of the size of superblocks used by the allocator.
+  ///                             In most use cases, the default value should work.
+  inline
+  MemoryPool( const backend_memory_space & memspace,
+              size_t total_size, size_t log2_superblock_size = 20 )
+    : m_lg_sb_size( log2_superblock_size ),
+      m_sb_size( size_t(1) << m_lg_sb_size ),
+      m_lg_max_sb_blocks( m_lg_sb_size - LG_MIN_BLOCK_SIZE ),
+      m_num_sb( ( total_size + m_sb_size - 1 ) >> m_lg_sb_size ),
+      m_ceil_num_sb( ( ( m_num_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE ) <<
+                     LG_BLOCKS_PER_PAGE ),
+      m_num_block_size( m_lg_sb_size - LG_MIN_BLOCK_SIZE + 1 ),
+      m_data_size( m_num_sb * m_sb_size ),
+      m_sb_blocks_size( ( m_num_sb << m_lg_max_sb_blocks ) / CHAR_BIT ),
+      m_empty_sb_size( m_ceil_num_sb / CHAR_BIT ),
+      m_partfull_sb_size( m_ceil_num_sb * m_num_block_size / CHAR_BIT ),
+      m_total_size( m_data_size +  m_sb_blocks_size + m_empty_sb_size + m_partfull_sb_size ),
+      m_data(0),
+      m_active( "Active superblocks" ),
+      m_sb_header( "Superblock headers" ),
+      m_track()
+  {
+    // Assumption.  The minimum block size must be a power of 2.
+    static_assert( Kokkos::Impl::is_integral_power_of_two( MIN_BLOCK_SIZE ), "" );
+
+    // Assumption.  Require a superblock be large enough so it takes at least 1
+    // whole bitset word to represent it using the minimum blocksize.
+    if ( m_sb_size < MIN_BLOCK_SIZE * BLOCKS_PER_PAGE ) {
+      printf( "\n** MemoryPool::MemoryPool() Superblock size must be >= %u **\n",
+              MIN_BLOCK_SIZE * BLOCKS_PER_PAGE );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
 #endif
+      Kokkos::abort( "" );
+    }
 
-#ifdef KOKKOS_MEMPOOL_PRINTERR
-    if ( used_memory != total_size ) {
-      printf( "\n** MemoryPool::MemoryPool() USED_MEMORY(%lu) != TOTAL_SIZE(%lu) **\n",
-              used_memory, total_size );
+    // Assumption.  A superblock's size can be at most 2^31.  Verify this.
+    if ( m_lg_sb_size > 31 ) {
+      printf( "\n** MemoryPool::MemoryPool() Superblock size must be < %u **\n",
+              ( uint32_t(1) << 31 ) );
 #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
       fflush( stdout );
 #endif
       Kokkos::abort( "" );
     }
+
+    // Assumption.  The Bitset only uses unsigned for size types which limits
+    // the amount of memory the allocator can manage.  Verify the memory size
+    // is below this limit.
+    if ( m_data_size > size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max() ) {
+      printf( "\n** MemoryPool::MemoryPool() Allocator can only manage %lu bytes of memory; requested %lu **\n",
+              size_t(MIN_BLOCK_SIZE) * std::numeric_limits<unsigned>::max(), total_size );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
 #endif
+      Kokkos::abort( "" );
+    }
+
+    // Allocate memory for Views.  This is done here instead of at construction
+    // so that the runtime checks can be performed before allocating memory.
+    resize(m_active, m_num_block_size );
+    resize(m_sb_header, m_num_sb );
 
-    // Create the chunks for each freelist.
-    for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
-      // Initialize the next pointers to point to the next chunk for all but the
-      // last chunk which uses a reserved value to indicate the end of the list.
-      initialize_mempool<MemPoolList> im( m_data, m_chunk_size[i], num_chunks[i] - 1,
-                                          get_head_offset( m_freelist[i] ) );
+    // Allocate superblock memory.
+    typedef Impl::SharedAllocationRecord< backend_memory_space, void >  SharedRecord;
+    SharedRecord * rec =
+      SharedRecord::allocate( memspace, "mempool", m_total_size );
 
-      Kokkos::Impl::ParallelFor< initialize_mempool<MemPoolList>, Range >
-        closure( im, Range( 0, num_chunks[i] ) );
+    m_track.assign_allocated_record_to_uninitialized( rec );
+    m_data = reinterpret_cast<char *>( rec->data() );
 
-      closure.execute();
+    // Set and initialize the free / empty block bitset memory.
+    m_sb_blocks.init( m_data + m_data_size, m_num_sb << m_lg_max_sb_blocks );
 
-      ExecutionSpace::fence();
-    }
+    // Set and initialize the empty superblock block bitset memory.
+    m_empty_sb.init( m_data + m_data_size + m_sb_blocks_size, m_num_sb );
 
-#ifdef KOKKOS_MEMPOOL_PRINT_INFO
-    print_mempool<MemPoolList> pm( num_chunk_sizes, m_chunk_size, m_freelist, m_data );
+    // Start with all superblocks in the empty category.
+    m_empty_sb.set();
+
+    // Set and initialize the partfull superblock block bitset memory.
+    m_partfull_sb.init( m_data + m_data_size + m_sb_blocks_size + m_empty_sb_size,
+                        m_ceil_num_sb * m_num_block_size );
+
+    // Initialize all active superblocks to be invalid.
+    typename UInt32View::HostMirror host_active = create_mirror_view(m_active);
+    for (size_t i = 0; i < m_num_block_size; ++i) host_active(i) = INVALID_SUPERBLOCK;
+
+    deep_copy(m_active, host_active);
+
+    // Initialize the blocksize info.
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      uint32_t lg_block_size = i + LG_MIN_BLOCK_SIZE;
+      uint32_t blocks_per_sb = m_sb_size >> lg_block_size;
+      uint32_t pages_per_sb = ( blocks_per_sb + BLOCKS_PER_PAGE - 1 ) >> LG_BLOCKS_PER_PAGE;
+
+      m_blocksize_info[i].m_blocks_per_sb = blocks_per_sb;
+      m_blocksize_info[i].m_pages_per_sb = pages_per_sb;
+
+      // Set the full level for the superblock.
+      m_blocksize_info[i].m_sb_full_level =
+        static_cast<uint32_t>( pages_per_sb * KOKKOS_MEMPOOL_SB_FULL_FRACTION );
+
+      if ( m_blocksize_info[i].m_sb_full_level == 0 ) {
+        m_blocksize_info[i].m_sb_full_level = 1;
+      }
 
-    Kokkos::Impl::ParallelFor< print_mempool<MemPoolList>, Range >
-      closure( pm, Range( 0, 10 ) );
+      // Set the full level for the page.
+      uint32_t blocks_per_page =
+        blocks_per_sb < BLOCKS_PER_PAGE ? blocks_per_sb : BLOCKS_PER_PAGE;
 
-    closure.execute();
+      m_blocksize_info[i].m_page_full_level =
+        static_cast<uint32_t>( blocks_per_page * KOKKOS_MEMPOOL_PAGE_FULL_FRACTION );
 
-    ExecutionSpace::fence();
+      if ( m_blocksize_info[i].m_page_full_level == 0 ) {
+        m_blocksize_info[i].m_page_full_level = 1;
+      }
+    }
+
+#ifdef KOKKOS_MEMPOOL_PRINT_CONSTRUCTOR_INFO
+    printf( "\n" );
+    printf( "      m_lg_sb_size: %12lu\n", m_lg_sb_size );
+    printf( "         m_sb_size: %12lu\n", m_sb_size );
+    printf( "   m_max_sb_blocks: %12lu\n", size_t(1) << m_lg_max_sb_blocks );
+    printf( "m_lg_max_sb_blocks: %12lu\n", m_lg_max_sb_blocks );
+    printf( "          m_num_sb: %12lu\n", m_num_sb );
+    printf( "     m_ceil_num_sb: %12lu\n", m_ceil_num_sb );
+    printf( "  m_num_block_size: %12lu\n", m_num_block_size );
+    printf( "        data bytes: %12lu\n", m_data_size );
+    printf( "   sb_blocks bytes: %12lu\n", m_sb_blocks_size );
+    printf( "    empty_sb bytes: %12lu\n", m_empty_sb_size );
+    printf( " partfull_sb bytes: %12lu\n", m_partfull_sb_size );
+    printf( "       total bytes: %12lu\n", m_total_size );
+    printf( "   m_empty_sb size: %12u\n", m_empty_sb.size() );
+    printf( "m_partfull_sb size: %12u\n", m_partfull_sb.size() );
+    printf( "\n" );
+    fflush( stdout );
 #endif
-  }
 
-  /// \brief Releases a lock on a freelist.
-  KOKKOS_FUNCTION
-  uint64_t acquire_lock( volatile uint64_t * freelist ) const;
+#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+    // Print the blocksize info for all the block sizes.
+    printf( "SIZE    BLOCKS_PER_SB    PAGES_PER_SB    SB_FULL_LEVEL    PAGE_FULL_LEVEL\n" );
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      printf( "%4zu    %13u    %12u    %13u    %15u\n", i + LG_MIN_BLOCK_SIZE,
+              m_blocksize_info[i].m_blocks_per_sb, m_blocksize_info[i].m_pages_per_sb,
+              m_blocksize_info[i].m_sb_full_level, m_blocksize_info[i].m_page_full_level );
+    }
+    printf( "\n" );
+#endif
+  }
 
-  /// \brief Releases a lock on a freelist.
+  /// \brief  The actual block size allocated given alloc_size.
+  KOKKOS_INLINE_FUNCTION
+  size_t allocate_block_size( const size_t alloc_size ) const
+  { return size_t(1) << ( get_block_size_index( alloc_size ) + LG_MIN_BLOCK_SIZE); }
+
+  /// \brief Allocate a chunk of memory.
+  /// \param alloc_size Size of the requested allocation in number of bytes.
+  ///
+  /// The function returns a void pointer to a memory location on success and
+  /// NULL on failure.
   KOKKOS_FUNCTION
-  void release_lock( volatile uint64_t * freelist, uint64_t new_head ) const;
+  void * allocate( size_t alloc_size ) const
+  {
+    void * p = 0;
 
-  /// \brief Tries to refill a freelist using a chunk from another freelist.
-  KOKKOS_FUNCTION
-  void * refill_freelist( size_t l_exp ) const;
+    // Only support allocations up to the superblock size.  Just return 0
+    // (failed allocation) for any size above this.
+    if (alloc_size <= m_sb_size )
+    {
+      int block_size_id = get_block_size_index( alloc_size );
+      uint32_t blocks_per_sb = m_blocksize_info[block_size_id].m_blocks_per_sb;
+      uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
+      unsigned word_size = blocks_per_sb > 32 ? 32 : blocks_per_sb;
+      unsigned word_mask = ( uint64_t(1) << word_size ) - 1;
 
-  /// \brief Claim chunks of untracked memory from the pool.
-  KOKKOS_FUNCTION
-  void * allocate( size_t alloc_size ) const;
+      uint32_t sb_id = volatile_load( &m_active(block_size_id) );
 
-  /// \brief Release claimed memory back into the pool.
-  KOKKOS_FUNCTION
-  void deallocate( void * alloc_ptr, size_t alloc_size ) const;
+      // If the active is locked, keep reading it until the lock is released.
+      while ( sb_id == SUPERBLOCK_LOCK ) {
+        sb_id = volatile_load( &m_active(block_size_id) );
+      }
 
-  // \brief Pulls the offset from a freelist head.
-  KOKKOS_INLINE_FUNCTION
-  uint64_t get_head_offset(uint64_t head) const
-  { return ( head >> TAGBITS ) << LG_MIN_CHUNKSIZE; }
+      bool allocation_done = false;
+
+      while (!allocation_done) {
+        bool need_new_sb = false;
+
+        if (sb_id != INVALID_SUPERBLOCK) {
+          // Use the value from the clock register as the hash value.
+          uint64_t hash_val = get_clock_register();
+
+          // Get the starting position for this superblock's bits in the bitset.
+          uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
+
+          // Mod the hash value to choose a page in the superblock.  The
+          // initial block searched is the first block of that page.
+          uint32_t pos_rel = uint32_t( hash_val & ( pages_per_sb - 1 ) ) << LG_BLOCKS_PER_PAGE;
+
+          // Get the absolute starting position for this superblock's bits in the bitset.
+          uint32_t pos = pos_base + pos_rel;
+
+          // Keep track of the number of pages searched.  Pages in the superblock are
+          // searched linearly from the starting page.  All pages in the superblock are
+          // searched until either a location is found, or it is proven empty.
+          uint32_t pages_searched = 0;
+
+          bool search_done = false;
+
+          while (!search_done) {
+            bool success;
+            unsigned prev_val;
+
+            Kokkos::tie( success, pos ) =
+              m_sb_blocks.set_any_in_word( pos, prev_val, word_mask );
+
+            if ( !success ) {
+              if ( ++pages_searched >= pages_per_sb ) {
+                // Searched all the pages in this superblock.  Look for a new superblock.
+                //
+                // The previous method tried limiting the number of pages searched, but
+                // that caused a huge performance issue in CUDA where the outer loop
+                // executed massive numbers of times.  Threads weren't able to find a
+                // free location when the superblock wasn't full and were able to execute
+                // the outer loop many times before the superblock was switched for a new
+                // one.  Switching to an exhaustive search eliminated this possibility and
+                // didn't slow anything down for the tests.
+                need_new_sb = true;
+                search_done = true;
+              }
+              else {
+                // Move to the next page making sure the new search position
+                // doesn't go past this superblock's bits.
+                pos += BLOCKS_PER_PAGE;
+                pos = ( pos < pos_base + blocks_per_sb ) ? pos : pos_base;
+              }
+            }
+            else {
+              // Reserved a memory location to allocate.
+              search_done = true;
+              allocation_done = true;
+
+              uint32_t lg_block_size = block_size_id + LG_MIN_BLOCK_SIZE;
+
+              p = m_data + ( size_t(sb_id) << m_lg_sb_size ) +
+                  ( ( pos - pos_base ) << lg_block_size );
+
+              uint32_t used_bits = Kokkos::Impl::bit_count( prev_val );
+
+              if ( used_bits == 0 ) {
+                // This page was empty.  Decrement the number of empty pages for
+                // the superblock.
+                atomic_fetch_sub( &m_sb_header(sb_id).m_empty_pages, 1 );
+              }
+              else if ( used_bits == m_blocksize_info[block_size_id].m_page_full_level - 1 )
+              {
+                // This page is full.  Increment the number of full pages for
+                // the superblock.
+                uint32_t full_pages = atomic_fetch_add( &m_sb_header(sb_id).m_full_pages, 1 );
+
+                // This allocation made the superblock full, so a new one needs to be found.
+                if ( full_pages == m_blocksize_info[block_size_id].m_sb_full_level - 1 ) {
+                  need_new_sb = true;
+                }
+              }
+            }
+          }
+        }
+        else {
+          // This is the first allocation for this block size.  A superblock needs
+          // to be set as the active one.  If this point is reached any other time,
+          // it is an error.
+          need_new_sb = true;
+        }
+
+        if ( need_new_sb ) {
+          uint32_t new_sb_id = find_superblock( block_size_id, sb_id );
+
+          if ( new_sb_id == sb_id ) {
+            allocation_done = true;
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+            printf( "** No superblocks available. **\n" );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+            fflush( stdout );
+#endif
+#endif
+          }
+          else {
+            sb_id = new_sb_id;
+          }
+        }
+      }
+    }
+#ifdef KOKKOS_MEMPOOL_PRINT_INFO
+    else {
+      printf( "** Requested allocation size (%zu) larger than superblock size (%lu). **\n",
+              alloc_size, m_sb_size);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+    }
+#endif
 
-  // \brief Pulls the tag from a freelist head.
-  KOKKOS_INLINE_FUNCTION
-  uint64_t get_head_tag(uint64_t head) const { return head & TAGBITS_MASK; }
-  // \brief Creates a freelist head from a offset and tag.
-  KOKKOS_INLINE_FUNCTION
-  uint64_t create_head(uint64_t offset, uint64_t tag) const
-  { return ( ( offset >> LG_MIN_CHUNKSIZE ) << TAGBITS ) | tag; }
+    return p;
+  }
 
-  // \brief Increments a tag.
-  KOKKOS_INLINE_FUNCTION
-  uint64_t increment_tag(uint64_t tag) const { return ( tag + 1 ) & TAGBITS_MASK; }
+  /// \brief Release allocated memory back to the pool.
+  /// \param alloc_ptr Pointer to chunk of memory previously allocated by
+  ///                  the allocator.
+  /// \param alloc_size Size of the allocated memory in number of bytes.
+  KOKKOS_FUNCTION
+  void deallocate( void * alloc_ptr, size_t alloc_size ) const
+  {
+    char * ap = static_cast<char *>( alloc_ptr );
+
+    // Only deallocate memory controlled by this pool.
+    if ( ap >= m_data && ap + alloc_size <= m_data + m_data_size ) {
+      // Get the superblock for the address.  This can be calculated by math on
+      // the address since the superblocks are stored contiguously in one memory
+      // chunk.
+      uint32_t sb_id = ( ap - m_data ) >> m_lg_sb_size;
+
+      // Get the starting position for this superblock's bits in the bitset.
+      uint32_t pos_base = sb_id << m_lg_max_sb_blocks;
+
+      // Get the relative position for this memory location's bit in the bitset.
+      uint32_t offset = ( ap - m_data ) - ( size_t(sb_id) << m_lg_sb_size );
+      uint32_t lg_block_size = m_sb_header(sb_id).m_lg_block_size;
+      uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
+      uint32_t pos_rel = offset >> lg_block_size;
+
+      bool success;
+      unsigned prev_val;
+
+      Kokkos::tie( success, prev_val ) = m_sb_blocks.fetch_word_reset( pos_base + pos_rel );
+
+      // If the memory location was previously deallocated, do nothing.
+      if ( success ) {
+        uint32_t page_fill_level = Kokkos::Impl::bit_count( prev_val );
+
+        if ( page_fill_level == 1 ) {
+          // This page is now empty.  Increment the number of empty pages for the
+          // superblock.
+          uint32_t empty_pages = atomic_fetch_add( &m_sb_header(sb_id).m_empty_pages, 1 );
+
+          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
+               empty_pages == m_blocksize_info[block_size_id].m_pages_per_sb - 1 )
+          {
+            // This deallocation caused the superblock to be empty.  Change the
+            // superblock category from partially full to empty.
+            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
+
+            if ( m_partfull_sb.reset( pos ) ) {
+              // Reset the empty pages and block size for the superblock.
+              volatile_store( &m_sb_header(sb_id).m_empty_pages, uint32_t(0) );
+              volatile_store( &m_sb_header(sb_id).m_lg_block_size, uint32_t(0) );
+
+              memory_fence();
+
+              m_empty_sb.set( sb_id );
+            }
+          }
+        }
+        else if ( page_fill_level == m_blocksize_info[block_size_id].m_page_full_level ) {
+          // This page is no longer full.  Decrement the number of full pages for
+          // the superblock.
+          uint32_t full_pages = atomic_fetch_sub( &m_sb_header(sb_id).m_full_pages, 1 );
+
+          if ( !volatile_load( &m_sb_header(sb_id).m_is_active ) &&
+               full_pages == m_blocksize_info[block_size_id].m_sb_full_level )
+          {
+            // This deallocation caused the number of full pages to decrease below
+            // the full threshold.  Change the superblock category from full to
+            // partially full.
+            unsigned pos = block_size_id * m_ceil_num_sb + sb_id;
+            m_partfull_sb.set( pos );
+          }
+        }
+      }
+    }
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+    else {
+      printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n",
+              reinterpret_cast<uint64_t>( alloc_ptr ) );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+      fflush( stdout );
+#endif
+    }
+#endif
+  }
 
-  /// \brief Tests if the memory pool is empty.
+  /// \brief Tests if the memory pool has no more memory available to allocate.
   KOKKOS_INLINE_FUNCTION
   bool is_empty() const
   {
-    size_t l = 0;
-    while ( m_chunk_size[l] > 0 &&
-            get_head_offset( m_freelist[l] ) == FREELIST_END )
-    {
-      ++l;
+    // The allocator is empty if all superblocks are full.  A superblock is
+    // full if it has >= 80% of its pages allocated.
+
+    // Look at all the superblocks.  If one is not full, then the allocator
+    // isn't empty.
+    for ( size_t i = 0; i < m_num_sb; ++i ) {
+      uint32_t lg_block_size = m_sb_header(i).m_lg_block_size;
+
+      // A superblock only has a block size of 0 when it is empty.
+      if ( lg_block_size == 0 ) return false;
+
+      uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
+      uint32_t full_pages = volatile_load( &m_sb_header(i).m_full_pages );
+
+      if ( full_pages < m_blocksize_info[block_size_id].m_sb_full_level ) return false;
     }
 
-    return m_chunk_size[l] == 0;
+    // All the superblocks were full.  The allocator is empty.
+    return true;
   }
 
   // The following functions are used for debugging.
   void print_status() const
   {
-    for ( size_t l = 0; m_chunk_size[l] > 0; ++l ) {
-      size_t count = 0;
-      uint64_t chunk = get_head_offset( m_freelist[l] );
+    printf( "\n" );
 
-      while ( chunk != FREELIST_END ) {
-        ++count;
-        chunk = *reinterpret_cast<uint64_t *>( m_data + chunk );
-      }
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+    typename SBHeaderView::HostMirror host_sb_header = create_mirror_view(m_sb_header);
+    deep_copy( host_sb_header, m_sb_header );
+
+    UInt32View num_allocated_blocks( "Allocated Blocks", m_num_sb );
 
-      printf( "chunk_size: %6lu    num_chunks: %8lu\n", m_chunk_size[l], count );
-      fflush(stdout);
+    // Count the number of allocated blocks per superblock.
+    {
+      MempoolImpl::count_allocated_blocks< UInt32View, SBHeaderView, MempoolBitset >
+        mch( m_num_sb, num_allocated_blocks, m_sb_header,
+             m_sb_blocks, m_sb_size, m_lg_max_sb_blocks );
     }
-  }
 
-  KOKKOS_INLINE_FUNCTION
-  size_t get_min_chunk_size() const { return m_chunk_size[0]; }
+    typename UInt32View::HostMirror host_num_allocated_blocks =
+      create_mirror_view(num_allocated_blocks);
+    deep_copy( host_num_allocated_blocks, num_allocated_blocks );
+
+    // Print header info of all superblocks.
+    printf( "SB_ID    SIZE    ACTIVE    EMPTY_PAGES    FULL_PAGES    USED_BLOCKS\n" );
+    for ( size_t i = 0; i < m_num_sb; ++i ) {
+      printf( "%5zu    %4u    %6d    %11u    %10u     %10u\n", i,
+              host_sb_header(i).m_lg_block_size, host_sb_header(i).m_is_active,
+              host_sb_header(i).m_empty_pages, host_sb_header(i).m_full_pages,
+              host_num_allocated_blocks(i) );
+    }
 
-  size_t get_mem_size() const { return m_data_size; }
-};
+    printf( "\n" );
+#endif
 
-} // namespace Impl
-} // namespace Experimental
-} // namespace Kokkos
+    UInt32View page_histogram( "Page Histogram", 33 );
 
-//----------------------------------------------------------------------------
-/*  Prefer to implement these functions in a separate
- *  compilation unit.  For CUDA this requires nvcc command
- *  --relocatable-device-code=true
- *  When this command is set then the macro
- *  KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
- *  is also set.
- */
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && \
-    ! defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
-
-#include <impl/Kokkos_MemoryPool_Inline.hpp>
+    // Get a View version of the blocksize info.
+    typedef View< BlockSizeHeader *, device_type >  BSHeaderView;
+    BSHeaderView blocksize_info( "BlockSize Headers", MAX_BLOCK_SIZES );
 
-#endif
+    Kokkos::Impl::DeepCopy< backend_memory_space, Kokkos::HostSpace >
+      dc( blocksize_info.ptr_on_device(), m_blocksize_info,
+          sizeof(BlockSizeHeader) * m_num_block_size );
 
-//----------------------------------------------------------------------------
+    Kokkos::pair< double, uint32_t > result = Kokkos::pair< double, uint32_t >( 0.0, 0 );
 
-namespace Kokkos {
-namespace Experimental {
+    // Create the page histogram.
+    {
+      MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
+        mch( 0, m_num_sb, page_histogram, blocksize_info, m_sb_header, m_sb_blocks,
+             m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
+    }
 
-/// \class MemoryPool
-/// \brief Memory management for pool of same-sized chunks of memory.
-///
-/// MemoryPool is a memory space that can be on host or device.  It provides a
-/// pool memory allocator for fast allocation of same-sized chunks of memory.
-/// The memory is only accessible on the host / device this allocator is
-/// associated with.
-template < typename Space , typename ExecSpace >
-class MemoryPool {
-private:
+    typename UInt32View::HostMirror host_page_histogram = create_mirror_view(page_histogram);
+    deep_copy( host_page_histogram, page_histogram );
 
-  Impl::MemPoolList  m_memory;
+    // Find the used and total pages and blocks.
+    uint32_t used_pages = 0;
+    uint32_t used_blocks = 0;
+    for ( uint32_t i = 1; i < 33; ++i ) {
+      used_pages += host_page_histogram(i);
+      used_blocks += i * host_page_histogram(i);
+    }
+    uint32_t total_pages = used_pages + host_page_histogram(0);
+
+    unsigned num_empty_sb = m_empty_sb.count();
+    unsigned num_non_empty_sb = m_num_sb - num_empty_sb;
+    unsigned num_partfull_sb = m_partfull_sb.count();
 
-  typedef ExecSpace                     execution_space;
-  typedef typename Space::memory_space  backend_memory_space;
+    uint32_t total_blocks = result.second;
+    double ave_sb_full = num_non_empty_sb == 0 ? 0.0 : result.first / num_non_empty_sb;
+    double percent_used_sb = double( m_num_sb - num_empty_sb ) / m_num_sb;
+    double percent_used_pages = total_pages == 0 ? 0.0 : double(used_pages) / total_pages;
+    double percent_used_blocks = total_blocks == 0 ? 0.0 : double(used_blocks) / total_blocks;
 
-#if defined( KOKKOS_HAVE_CUDA )
+    // Count active superblocks.
+    typename UInt32View::HostMirror host_active = create_mirror_view(m_active);
+    deep_copy(host_active, m_active);
 
-  // Current implementation requires CudaUVM memory space
-  // for Cuda memory pool.
+    unsigned num_active_sb = 0;
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      num_active_sb += host_active(i) != INVALID_SUPERBLOCK;
+    }
 
-  static_assert(
-    ! std::is_same< typename Space::memory_space , Kokkos::CudaSpace >::value ,
-    "Kokkos::MemoryPool currently cannot use Kokkos::CudaSpace, you must use Kokkos::CudaUVMSpace" );
+#ifdef KOKKOS_MEMPOOL_PRINT_ACTIVE_SUPERBLOCKS
+    // Print active superblocks.
+    printf( "BS_ID      SB_ID\n" );
+    for ( size_t i = 0; i < m_num_block_size; ++i ) {
+      uint32_t sb_id = host_active(i);
 
+      if ( sb_id == INVALID_SUPERBLOCK ) {
+        printf( "%5zu          I\n", i );
+      }
+      else if ( sb_id == SUPERBLOCK_LOCK ) {
+        printf( "%5zu          L\n", i );
+      }
+      else {
+        printf( "%5zu    %7u\n", i, sb_id );
+      }
+    }
+    printf( "\n" );
+    fflush( stdout );
 #endif
 
-public:
+#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+    // Print the summary page histogram.
+    printf( "USED_BLOCKS    PAGE_COUNT\n" );
+    for ( uint32_t i = 0; i < 33; ++i ) {
+      printf( "%10u    %10u\n", i, host_page_histogram[i] );
+    }
+    printf( "\n" );
+#endif
 
-  //! Tag this class as a kokkos memory space
-  typedef MemoryPool  memory_space;
+#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+    // Print the page histogram for a few individual superblocks.
+//    const uint32_t num_sb_id = 2;
+//    uint32_t sb_id[num_sb_id] = { 0, 10 };
+    const uint32_t num_sb_id = 1;
+    uint32_t sb_id[num_sb_id] = { 0 };
 
-  //------------------------------------
+    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
+      deep_copy( page_histogram, 0 );
 
-  MemoryPool() = default;
-  MemoryPool( MemoryPool && rhs ) = default;
-  MemoryPool( const MemoryPool & rhs ) = default;
-  MemoryPool & operator = ( MemoryPool && ) = default;
-  MemoryPool & operator = ( const MemoryPool & ) = default;
-  ~MemoryPool() = default;
+      {
+        MempoolImpl::create_histogram< UInt32View, BSHeaderView, SBHeaderView, MempoolBitset >
+          mch( sb_id[i], sb_id[i] + 1, page_histogram, blocksize_info, m_sb_header,
+               m_sb_blocks, m_lg_max_sb_blocks, LG_MIN_BLOCK_SIZE, BLOCKS_PER_PAGE, result );
+      }
 
-  /// \brief Allocate memory pool
-  /// \param memspace         From where to allocate the pool.
-  /// \param base_chunk_size  Hand out memory in chunks of this size.
-  /// \param total_size       Total size of the pool.
-  MemoryPool( const backend_memory_space & memspace,
-              size_t base_chunk_size, size_t total_size,
-              size_t num_chunk_sizes = 4, size_t chunk_spacing = 4 )
-    : m_memory( memspace, execution_space(), base_chunk_size, total_size,
-                num_chunk_sizes, chunk_spacing )
-  {}
-
-  /// \brief Claim chunks of untracked memory from the pool.
-  /// Can only be called from device.
-  KOKKOS_INLINE_FUNCTION
-  void * allocate( const size_t alloc_size ) const
-  { return m_memory.allocate( alloc_size ); }
+      deep_copy( host_page_histogram, page_histogram );
 
-  /// \brief Release claimed memory back into the pool
-  /// Can only be called from device.
-  KOKKOS_INLINE_FUNCTION
-  void deallocate( void * const alloc_ptr, const size_t alloc_size ) const
-  { m_memory.deallocate( alloc_ptr, alloc_size ); }
+      printf( "SB_ID    USED_BLOCKS    PAGE_COUNT\n" );
+      for ( uint32_t j = 0; j < 33; ++j ) {
+        printf( "%5u    %10u    %10u\n", sb_id[i], j, host_page_histogram[j] );
+      }
+      printf( "\n" );
+    }
 
-  /// \brief Is out of memory at this instant
-  KOKKOS_INLINE_FUNCTION
-  bool is_empty() const { return m_memory.is_empty(); }
+/*
+    // Print the blocks used for each page of a few individual superblocks.
+    for ( uint32_t i = 0; i < num_sb_id; ++i ) {
+      uint32_t lg_block_size = host_sb_header(sb_id[i]).m_lg_block_size;
+      if ( lg_block_size != 0 ) {
+        printf( "SB_ID    BLOCK ID    USED_BLOCKS\n" );
+
+        uint32_t block_size_id = lg_block_size - LG_MIN_BLOCK_SIZE;
+        uint32_t pages_per_sb = m_blocksize_info[block_size_id].m_pages_per_sb;
+
+        for ( uint32_t j = 0; j < pages_per_sb; ++j ) {
+          unsigned start_pos = ( sb_id[i] << m_lg_max_sb_blocks ) + j * BLOCKS_PER_PAGE;
+          unsigned end_pos = start_pos + BLOCKS_PER_PAGE;
+          uint32_t num_allocated_blocks = 0;
+
+          for ( unsigned k = start_pos; k < end_pos; ++k ) {
+            num_allocated_blocks += m_sb_blocks.test( k );
+          }
+
+          printf( "%5u    %8u    %11u\n", sb_id[i], j, num_allocated_blocks );
+        }
+
+        printf( "\n" );
+      }
+    }
+*/
+#endif
+
+    printf( "   Used blocks: %10u / %10u = %10.6lf\n", used_blocks, total_blocks,
+           percent_used_blocks );
+    printf( "    Used pages: %10u / %10u = %10.6lf\n", used_pages, total_pages,
+           percent_used_pages );
+    printf( "       Used SB: %10zu / %10zu = %10.6lf\n", m_num_sb - num_empty_sb, m_num_sb,
+           percent_used_sb );
+    printf( "     Active SB: %10u\n", num_active_sb );
+    printf( "      Empty SB: %10u\n", num_empty_sb );
+    printf( "   Partfull SB: %10u\n", num_partfull_sb );
+    printf( "       Full SB: %10lu\n",
+           m_num_sb - num_active_sb - num_empty_sb - num_partfull_sb );
+    printf( "Ave. SB Full %%: %10.6lf\n", ave_sb_full );
+    printf( "\n" );
+    fflush( stdout );
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+    fflush( stdout );
+#endif
+  }
 
-  /// \brief Minimum chunk size allocatable.
   KOKKOS_INLINE_FUNCTION
-  size_t get_min_chunk_size() const { return m_memory.get_min_chunk_size(); }
+  size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
 
-  // The following functions are used for debugging.
-  void print_status() const { m_memory.print_status(); }
-  size_t get_mem_size() const { return m_memory.get_mem_size(); }
+  size_t get_mem_size() const { return m_data_size; }
+
+private:
+  /// \brief Returns the index into the active array for the given size.
+  ///
+  /// Computes log2 of the smallest power of two >= the given size
+  /// ( ie ceil( log2(size) ) ) offset by LG_MIN_BLOCK_SIZE.
+  KOKKOS_FORCEINLINE_FUNCTION
+  int get_block_size_index( const size_t size ) const
+  {
+    // We know the size fits in a 32 bit unsigned because the size of a
+    // superblock is limited to 2^31, so casting to an unsigned is safe.
+
+    // Find the most significant nonzero bit.
+    uint32_t first_nonzero_bit =
+      Kokkos::Impl::bit_scan_reverse( static_cast<unsigned>( size ) );
+
+    // If size is an integral power of 2, ceil( log2(size) ) is equal to the
+    // most significant nonzero bit.  Otherwise, you need to add 1.  Since the
+    // minimum block size is MIN_BLOCK_SIZE, make sure ceil( log2(size) ) is at
+    // least LG_MIN_BLOCK_SIZE.
+    uint32_t lg2_size = first_nonzero_bit + !Kokkos::Impl::is_integral_power_of_two( size );
+    lg2_size = lg2_size > LG_MIN_BLOCK_SIZE ? lg2_size : LG_MIN_BLOCK_SIZE;
+
+    // Return ceil( log2(size) ) shifted so that the value for MIN_BLOCK_SIZE
+    // is 0.
+    return lg2_size - LG_MIN_BLOCK_SIZE;
+  }
+
+  /// \brief Finds a superblock with free space to become a new active superblock.
+  ///
+  /// If this function is called, the current active superblock needs to be replaced
+  /// because it is full.  Initially, only the thread that sets the active superblock
+  /// to full calls this function.  Other threads can still allocate from the "full"
+  /// active superblock because a full superblock still has locations available.  If
+  /// a thread tries to allocate from the active superblock when it has no free
+  /// locations, then that thread will call this function, too, and spin on a lock
+  /// waiting until the active superblock has been replaced.
+  KOKKOS_FUNCTION
+  uint32_t find_superblock( int block_size_id, uint32_t old_sb ) const
+  {
+    // Try to grab the lock on the head.
+    uint32_t lock_sb =
+      Kokkos::atomic_compare_exchange( &m_active(block_size_id), old_sb, SUPERBLOCK_LOCK );
+
+    // Initialize the new superblock to be the previous one so the previous
+    // superblock is returned if a new superblock can't be found.
+    uint32_t new_sb = lock_sb;
+
+    if ( lock_sb == old_sb ) {
+      // This thread has the lock.
+
+      // 1. Look for a partially filled superblock that is of the right block
+      //    size.
+
+      size_t max_tries = m_ceil_num_sb >> LG_BLOCKS_PER_PAGE;
+      size_t tries = 0;
+      bool search_done = false;
+
+      // Set the starting search position to the beginning of this block
+      // size's bitset.
+      unsigned pos = block_size_id * m_ceil_num_sb;
+
+      while (!search_done) {
+        bool success = false;
+        unsigned prev_val;
+
+        Kokkos::tie( success, pos ) = m_partfull_sb.reset_any_in_word( pos, prev_val );
+
+        if ( !success ) {
+          if ( ++tries >= max_tries ) {
+            // Exceeded number of words for this block size's bitset.
+            search_done = true;
+          }
+          else {
+            pos += BLOCKS_PER_PAGE;
+          }
+        }
+        else {
+          // Found a superblock.
+          search_done = true;
+          new_sb = pos - block_size_id * m_ceil_num_sb;
+
+          // Assertions:
+          //   1. A different superblock than the current should be found.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+          if ( new_sb == lock_sb ) {
+            printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n",
+                    new_sb);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+            fflush( stdout );
+#endif
+            Kokkos::abort( "" );
+          }
+#endif
+
+          // Set the head status for the superblock.
+          volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );
+
+          // If there was a previous active superblock, mark it as not active.
+          // It is now in the full category and as such isn't tracked.
+          if ( lock_sb != INVALID_SUPERBLOCK ) {
+            volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
+          }
+
+          memory_fence();
+        }
+      }
+
+      // 2. Look for an empty superblock.
+      if ( new_sb == lock_sb ) {
+        tries = 0;
+        search_done = false;
+
+        // Set the starting search position to the beginning of this block
+        // size's bitset.
+        pos = 0;
+
+        while (!search_done) {
+          bool success = false;
+          unsigned prev_val;
+
+          Kokkos::tie( success, pos ) = m_empty_sb.reset_any_in_word( pos, prev_val );
+
+          if ( !success ) {
+            if ( ++tries >= max_tries ) {
+              // Exceeded number of words for this block size's bitset.
+              search_done = true;
+            }
+            else {
+              pos += BLOCKS_PER_PAGE;
+            }
+          }
+          else {
+            // Found a superblock.
+            search_done = true;
+            new_sb = pos;
+
+            // Assertions:
+            //   1. A different superblock than the current should be found.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+            if ( new_sb == lock_sb ) {
+              printf( "\n** MemoryPool::find_superblock() FOUND_SAME_SUPERBLOCK: %u **\n",
+                      new_sb);
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+              fflush( stdout );
+#endif
+              Kokkos::abort( "" );
+            }
+#endif
+
+            // Set the empty pages, block size, and head status for the
+            // superblock.
+            volatile_store( &m_sb_header(new_sb).m_empty_pages,
+                            m_blocksize_info[block_size_id].m_pages_per_sb );
+            volatile_store( &m_sb_header(new_sb).m_lg_block_size,
+                            block_size_id + LG_MIN_BLOCK_SIZE );
+            volatile_store( &m_sb_header(new_sb).m_is_active, uint32_t(true) );
+
+            // If there was a previous active superblock, mark it as not active.
+            // It is now in the full category and as such isn't tracked.
+            if ( lock_sb != INVALID_SUPERBLOCK ) {
+              volatile_store( &m_sb_header(lock_sb).m_is_active, uint32_t(false) );
+            }
+
+            memory_fence();
+          }
+        }
+      }
+
+      // Write the new active superblock to release the lock.
+      atomic_exchange( &m_active(block_size_id), new_sb );
+    }
+    else {
+      // Either another thread has the lock and is switching the active superblock for
+      // this block size or another thread has already changed the active superblock
+      // since this thread read its value.  Keep reading the active superblock until
+      // it isn't locked to get the new active superblock.
+      do {
+        new_sb = volatile_load( &m_active(block_size_id) );
+      } while ( new_sb == SUPERBLOCK_LOCK );
+
+      // Assertions:
+      //   1. An invalid superblock should never be found here.
+      //   2. If the new superblock is the same as the previous superblock, the
+      //      allocator is empty.
+#ifdef KOKKOS_MEMPOOL_PRINTERR
+      if ( new_sb == INVALID_SUPERBLOCK ) {
+        printf( "\n** MemoryPool::find_superblock() FOUND_INACTIVE_SUPERBLOCK **\n" );
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+        fflush( stdout );
+#endif
+        Kokkos::abort( "" );
+      }
+#endif
+    }
+
+    return new_sb;
+  }
+
+  /// Returns 64 bits from a clock register.
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint64_t get_clock_register(void) const
+  {
+#if defined( __CUDA_ARCH__ )
+    // Return value of 64-bit hi-res clock register.
+    return clock64();
+#elif defined( __i386__ ) || defined( __x86_64 )
+    // Return value of 64-bit hi-res clock register.
+    unsigned a, d;
+    __asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
+    return ( (uint64_t) a) | ( ( (uint64_t) d ) << 32 );
+#else
+    const uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+    return ticks;
+#endif
+  }
 };
 
 } // namespace Experimental
@@ -583,4 +1501,23 @@ public:
 #undef KOKKOS_MEMPOOL_PRINT_INFO
 #endif
 
-#endif /* #define KOKKOS_MEMORYPOOL_HPP */
+#ifdef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_BLOCKSIZE_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#undef KOKKOS_MEMPOOL_PRINT_SUPERBLOCK_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_PAGE_INFO
+#endif
+
+#ifdef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#undef KOKKOS_MEMPOOL_PRINT_INDIVIDUAL_PAGE_INFO
+#endif
+
+#undef KOKKOS_MEMPOOL_SB_FULL_FRACTION
+#undef KOKKOS_MEMPOOL_PAGE_FULL_FRACTION
+
+#endif // KOKKOS_MEMORYPOOL_HPP
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index 389ee4b2fd..7be4f8245f 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -58,9 +58,11 @@
 #endif
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
 
 #include <OpenMP/Kokkos_OpenMPexec.hpp>
 #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMP_Task.hpp>
 
 /*--------------------------------------------------------------------------*/
 
diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp
index 7e906a4571..83436826f4 100644
--- a/lib/kokkos/core/src/Kokkos_Pair.hpp
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@@ -1,12 +1,12 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -35,7 +35,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 
@@ -125,17 +125,26 @@ struct pair
     return *this;
   }
 
-  /// \brief Assignment operator.
+
+  /// \brief Assignment operator, for volatile <tt>*this</tt>.
   ///
-  /// This calls the assignment operators of T1 and T2.  It won't
+  /// \param p [in] Input; right-hand side of the assignment.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It will not
   /// compile if the assignment operators are not defined and public.
+  ///
+  /// This operator returns \c void instead of <tt>volatile pair<T1,
+  /// T2>& </tt>.  See Kokkos Issue #177 for the explanation.  In
+  /// practice, this means that you should not chain assignments with
+  /// volatile lvalues.
   template <class U, class V>
   KOKKOS_FORCEINLINE_FUNCTION
-  volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
+  void operator=(const volatile pair<U,V> &p) volatile
   {
     first = p.first;
     second = p.second;
-    return *this;
+    // We deliberately do not return anything here.  See explanation
+    // in public documentation above.
   }
 
   // from std::pair<U,V>
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index edaced22a9..588dc90af3 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -57,7 +57,6 @@
 #include <typeinfo>
 #endif
 
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
@@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy  & policy
 {
 #if (KOKKOS_ENABLE_PROFILING)
     uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-     	Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+     	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
 
@@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy  & policy
    closure.execute();
 
 #if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-        Kokkos::Experimental::endParallelFor(kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+        Kokkos::Profiling::endParallelFor(kpID);
      }
 #endif
 }
@@ -210,8 +209,8 @@ void parallel_for( const size_t        work_count
 
 #if (KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-  	Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+  	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
     
@@ -222,8 +221,8 @@ void parallel_for( const size_t        work_count
   closure.execute();
 
 #if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelFor(kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+	Kokkos::Profiling::endParallelFor(kpID);
      }
 #endif
 }
@@ -248,405 +247,9 @@ void parallel_for( const std::string & str
   (void) str;
 }
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/** \brief  Parallel reduction
- *
- * Example of a parallel_reduce functor for a POD (plain old data) value type:
- * \code
- *  class FunctorType { // For POD value type
- *  public:
- *    typedef    ...     execution_space ;
- *    typedef <podType>  value_type ;
- *    void operator()( <intType> iwork , <podType> & update ) const ;
- *    void init( <podType> & update ) const ;
- *    void join( volatile       <podType> & update ,
- *               volatile const <podType> & input ) const ;
- *
- *    typedef true_type has_final ;
- *    void final( <podType> & update ) const ;
- *  };
- * \endcode
- *
- * Example of a parallel_reduce functor for an array of POD (plain old data) values:
- * \code
- *  class FunctorType { // For array of POD value
- *  public:
- *    typedef    ...     execution_space ;
- *    typedef <podType>  value_type[] ;
- *    void operator()( <intType> , <podType> update[] ) const ;
- *    void init( <podType> update[] ) const ;
- *    void join( volatile       <podType> update[] ,
- *               volatile const <podType> input[] ) const ;
- *
- *    typedef true_type has_final ;
- *    void final( <podType> update[] ) const ;
- *  };
- * \endcode
- */
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , const std::string& str = ""
-                    , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
-                    )
-{
-  // typedef typename
-  //   Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-  //     execution_space ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view ;
-
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-  	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-
-    Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-    Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
-    Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-    closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-}
-
-// integral range policy
-template< class FunctorType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    , const std::string& str = ""
-                    )
-{
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      execution_space ;
-
-  typedef RangePolicy< execution_space > policy ;
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view ;
-
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-  	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-
-}
-
-// general policy and view ouput
-template< class ExecPolicy , class FunctorType , class ViewType >
-inline
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , const ViewType    & result_view
-                    , const std::string& str = ""
-                    , typename Impl::enable_if<
-                      ( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
-#ifdef KOKKOS_HAVE_CUDA
-                        && ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
-#endif
-                      )>::type * = 0 )
-{
-    
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-
-}
-
-// general policy and pod or array of pod output
-template< class ExecPolicy , class FunctorType >
-void parallel_reduce( const ExecPolicy  & policy
-                    , const FunctorType & functor
-#ifdef KOKKOS_HAVE_CUDA
-                    , typename Impl::enable_if<
-                      ( ! Impl::is_integral< ExecPolicy >::value &&
-                        ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
-                      , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
-                      , const std::string& str = ""
-                      , typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
-                      )
-#else
-                      , typename Impl::enable_if<
-                        ( ! Impl::is_integral< ExecPolicy >::value)
-                        , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
-                        >::type result_ref
-                      , const std::string& str = ""
-                        )
-#endif
-{
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result_ref )
-               , ValueTraits::value_count( functor )
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-    
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-
 }
 
-// integral range policy and view ouput
-template< class FunctorType , class ViewType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    , const ViewType    & result_view
-                    , const std::string& str = ""
-                    , typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
-#ifdef KOKKOS_HAVE_CUDA
-                        && ! Impl::is_same<
-                          typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
-                          Kokkos::Cuda>::value
-#endif
-                        )>::type * = 0 )
-{
-  typedef typename
-    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      execution_space ;
-
-  typedef RangePolicy< execution_space > ExecPolicy ;
-
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-    
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-
-}
-
-// integral range policy and pod or array of pod output
-template< class FunctorType >
-inline
-void parallel_reduce( const size_t        work_count
-                    , const FunctorType & functor
-                    , typename Kokkos::Impl::FunctorValueTraits<
-                         typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
-                                             Impl::is_integral<FunctorType>::value,
-                            void,FunctorType>::type
-                         , void >::reference_type result
-                    , const std::string& str = ""
-                    , typename Impl::enable_if< true
-#ifdef KOKKOS_HAVE_CUDA
-                              && ! Impl::is_same<
-                             typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
-                             Kokkos::Cuda>::value
-#endif
-                     >::type * = 0 )
-{
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
-
-  typedef typename
-    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
-      execution_space ;
-
-  typedef Kokkos::RangePolicy< execution_space > policy ;
-
-  // Wrap the result output request in a view to inform the implementation
-  // of the type and memory space.
-
-  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
-                                     , typename ValueTraits::value_type
-                                     , typename ValueTraits::pointer_type
-                                     >::type value_type ;
-
-  Kokkos::View< value_type
-              , HostSpace
-              , Kokkos::MemoryUnmanaged
-              >
-    result_view( ValueOps::pointer( result )
-               , ValueTraits::value_count( functor )
-               );
-
-#if (KOKKOS_ENABLE_PROFILING)
-  uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
-     }
-#endif
-
-  Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
-  Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
-  Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-
-  closure.execute();
-
-#if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelReduce(kpID);
-     }
-#endif
-
-}
-#ifndef KOKKOS_HAVE_CUDA
-template< class ExecPolicy , class FunctorType , class ResultType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , ResultType * result)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
-
-  parallel_reduce(policy,functor,result,str);
-
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
-}
-
-template< class ExecPolicy , class FunctorType , class ResultType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor
-                    , ResultType & result)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
-
-  parallel_reduce(policy,functor,result,str);
-
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
-}
-
-template< class ExecPolicy , class FunctorType >
-inline
-void parallel_reduce( const std::string & str
-                    , const ExecPolicy  & policy
-                    , const FunctorType & functor)
-{
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
-  #endif
-
-  parallel_reduce(policy,functor,str);
-
-  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
-  Kokkos::fence();
-  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
-  #endif
-  (void) str;
-}
-#endif
-
-} // namespace Kokkos
-
+#include <Kokkos_Parallel_Reduce.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
 {
 #if (KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
 
@@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
   closure.execute();
 
 #if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+	Kokkos::Profiling::endParallelScan(kpID);
      }
 #endif
 
@@ -849,8 +452,8 @@ void parallel_scan( const size_t        work_count
 
 #if (KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
     
@@ -861,8 +464,8 @@ void parallel_scan( const size_t        work_count
   closure.execute();
 
 #if (KOKKOS_ENABLE_PROFILING)
-     if(Kokkos::Experimental::profileLibraryLoaded()) {
-	Kokkos::Experimental::endParallelScan(kpID);
+     if(Kokkos::Profiling::profileLibraryLoaded()) {
+	Kokkos::Profiling::endParallelScan(kpID);
      }
 #endif
 
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
new file mode 100644
index 0000000000..695bc79a1a
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -0,0 +1,1240 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+
+namespace Kokkos {
+
+
+template<class T, class Enable = void>
+struct is_reducer_type {
+  enum { value = 0 };
+};
+
+
+template<class T>
+struct is_reducer_type<T,typename std::enable_if<
+                       std::is_same<T,typename T::reducer_type>::value
+                      >::type> {
+  enum { value = 1 };
+};
+
+namespace Experimental {
+
+
+template<class Scalar,class Space = HostSpace>
+struct Sum {
+public:
+  //Required
+  typedef Sum reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return static_cast<value_type>(0);
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return value_type();
+    }
+  };
+
+public:
+
+  Sum(value_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(&result_) {}
+  Sum(const result_view_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(result_) {}
+  Sum(value_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  Sum(const result_view_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest += src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar,class Space = HostSpace>
+struct Prod {
+public:
+  //Required
+  typedef Prod reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return static_cast<value_type>(1);
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return value_type();
+    }
+  };
+
+public:
+
+  Prod(value_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(&result_) {}
+  Prod(const result_view_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(result_) {}
+  Prod(value_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  Prod(const result_view_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest *= src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest *= src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct Min {
+public:
+  //Required
+  typedef Min reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<value_type>::max();
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return value_type();
+    }
+  };
+
+public:
+
+  Min(value_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(&result_) {}
+  Min(const result_view_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(result_) {}
+  Min(value_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  Min(const result_view_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src < dest )
+      dest = src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct Max {
+public:
+  //Required
+  typedef Max reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<value_type>::min();
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return value_type();
+    }
+  };
+
+public:
+
+  Max(value_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(&result_) {}
+  Max(const result_view_type& result_):
+    init_value(InitWrapper<value_type>::value()),result(result_) {}
+  Max(value_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  Max(const result_view_type& result_, const value_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src > dest )
+      dest = src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LAnd {
+public:
+  //Required
+  typedef LAnd reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  result_view_type result;
+
+public:
+
+  LAnd(value_type& result_):result(&result_) {}
+  LAnd(const result_view_type& result_):result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest = dest && src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest && src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = 1;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LOr {
+public:
+  //Required
+  typedef LOr reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  result_view_type result;
+
+public:
+
+  LOr(value_type& result_):result(&result_) {}
+  LOr(const result_view_type& result_):result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest = dest || src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest || src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = 0;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct LXor {
+public:
+  //Required
+  typedef LXor reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  result_view_type result;
+
+public:
+
+  LXor(value_type& result_):result(&result_) {}
+  LXor(const result_view_type& result_):result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest = dest? (!src) : src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest? (!src) : src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = 0;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct BAnd {
+public:
+  //Required
+  typedef BAnd reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+public:
+
+  BAnd(value_type& result_):
+    init_value(value_type() | (~value_type())),result(&result_) {}
+  BAnd(const result_view_type& result_):
+    init_value(value_type() | (~value_type())),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+      dest = dest & src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest & src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct BOr {
+public:
+  //Required
+  typedef BOr reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+public:
+
+  BOr(value_type& result_):
+    init_value(value_type() & (~value_type())),result(&result_) {}
+  BOr(const result_view_type& result_):
+    init_value(value_type() & (~value_type())),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+      dest = dest | src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest | src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Space = HostSpace>
+struct BXor {
+public:
+  //Required
+  typedef BXor reducer_type;
+  typedef typename std::remove_cv<Scalar>::type value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  value_type init_value;
+
+private:
+  result_view_type result;
+
+public:
+
+  BXor(value_type& result_):
+    init_value(value_type() & (~value_type())),result(&result_) {}
+  BXor(const result_view_type& result_):
+    init_value(value_type() & (~value_type())),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+      dest = dest ^ src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest = dest ^ src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Index>
+struct ValLocScalar {
+  Scalar val;
+  Index loc;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const ValLocScalar& rhs) {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const volatile ValLocScalar& rhs) volatile {
+    val = rhs.val;
+    loc = rhs.loc;
+  }
+};
+
+template<class Scalar, class Index, class Space = HostSpace>
+struct MinLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MinLoc reducer_type;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  scalar_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<scalar_type>::max();
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  MinLoc(value_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(&result_) {}
+  MinLoc(const result_view_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(result_) {}
+  MinLoc(value_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  MinLoc(const result_view_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.val < dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.val < dest.val )
+      dest = src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Index, class Space = HostSpace>
+struct MaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MaxLoc reducer_type;
+  typedef ValLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  scalar_type init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct InitWrapper;
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<scalar_type>::min();
+    }
+  };
+
+  template<class ValueType >
+  struct InitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  MaxLoc(value_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(&result_) {}
+  MaxLoc(const result_view_type& result_):
+    init_value(InitWrapper<scalar_type>::value()),result(result_) {}
+  MaxLoc(value_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(&result_) {}
+  MaxLoc(const result_view_type& result_, const scalar_type& init_value_):
+    init_value(init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.val > dest.val )
+      dest = src;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.val = init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<class Scalar, class Index>
+struct MinMaxLocScalar {
+  Scalar min_val,max_val;
+  Index min_loc,max_loc;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const MinMaxLocScalar& rhs) {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator = (const volatile MinMaxLocScalar& rhs) volatile {
+    min_val = rhs.min_val;
+    min_loc = rhs.min_loc;
+    max_val = rhs.max_val;
+    max_loc = rhs.max_loc;
+  }
+};
+
+template<class Scalar, class Index, class Space = HostSpace>
+struct MinMaxLoc {
+private:
+  typedef typename std::remove_cv<Scalar>::type scalar_type;
+  typedef typename std::remove_cv<Index>::type index_type;
+
+public:
+  //Required
+  typedef MinMaxLoc reducer_type;
+  typedef MinMaxLocScalar<scalar_type,index_type> value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+  scalar_type min_init_value;
+  scalar_type max_init_value;
+
+private:
+  result_view_type result;
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct MinInitWrapper;
+
+  template<class ValueType >
+  struct MinInitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<scalar_type>::max();
+    }
+  };
+
+  template<class ValueType >
+  struct MinInitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+  template<class ValueType, bool is_arithmetic = std::is_arithmetic<ValueType>::value >
+  struct MaxInitWrapper;
+
+  template<class ValueType >
+  struct MaxInitWrapper<ValueType,true> {
+    static ValueType value() {
+      return std::numeric_limits<scalar_type>::min();
+    }
+  };
+
+  template<class ValueType >
+  struct MaxInitWrapper<ValueType,false> {
+    static ValueType value() {
+      return scalar_type();
+    }
+  };
+
+public:
+
+  MinMaxLoc(value_type& result_):
+    min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(&result_) {}
+  MinMaxLoc(const result_view_type& result_):
+    min_init_value(MinInitWrapper<scalar_type>::value()),max_init_value(MaxInitWrapper<scalar_type>::value()),result(result_) {}
+  MinMaxLoc(value_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_):
+    min_init_value(min_init_value_),max_init_value(max_init_value_),result(&result_) {}
+  MinMaxLoc(const result_view_type& result_, const scalar_type& min_init_value_, const scalar_type& max_init_value_):
+    min_init_value(min_init_value_),max_init_value(max_init_value_),result(result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    if ( src.min_val < dest.min_val ) {
+      dest.min_val = src.min_val;
+      dest.min_loc = src.min_loc;
+    }
+    if ( src.max_val > dest.max_val ) {
+      dest.max_val = src.max_val;
+      dest.max_loc = src.max_loc;
+    }
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val.min_val = min_init_value;
+    val.max_val = max_init_value;
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+}
+}
+
+
+namespace Kokkos {
+namespace Impl {
+
// Maps the user-supplied return argument of parallel_reduce (view, scalar,
// array/pointer, or reducer object) to a uniform view-based interface.
template< class T, class ReturnType , class ValueTraits>
struct ParallelReduceReturnValue;

// Specialization for a Kokkos::View return argument: the view itself is the
// reduction target, so it is passed through unchanged (by reference).
template< class ReturnType , class FunctorType >
struct ParallelReduceReturnValue<typename std::enable_if<Kokkos::is_view<ReturnType>::value>::type, ReturnType, FunctorType> {
  typedef ReturnType return_type;
  typedef InvalidType reducer_type;

  typedef typename return_type::value_type value_type_scalar;
  typedef typename return_type::value_type value_type_array[];

  // Rank-0 views reduce to a scalar; higher ranks to an array of values.
  typedef typename if_c<return_type::rank==0,value_type_scalar,value_type_array>::type value_type;

  static return_type& return_value(ReturnType& return_val, const FunctorType&) {
    return return_val;
  }
};
+
// Specialization for a plain scalar return argument: wrap its address in an
// unmanaged host view so the reduction machinery has a view to write into.
template< class ReturnType , class FunctorType>
struct ParallelReduceReturnValue<typename std::enable_if<
                                   !Kokkos::is_view<ReturnType>::value &&
                                  (!std::is_array<ReturnType>::value && !std::is_pointer<ReturnType>::value) &&
                                   !Kokkos::is_reducer_type<ReturnType>::value
                                 >::type, ReturnType, FunctorType> {
  typedef Kokkos::View<  ReturnType
                       , Kokkos::HostSpace
                       , Kokkos::MemoryUnmanaged
      > return_type;

  typedef InvalidType reducer_type;

  typedef typename return_type::value_type value_type;

  static return_type return_value(ReturnType& return_val, const FunctorType&) {
    return return_type(&return_val);
  }
};
+
// Specialization for C-array / pointer return arguments: wrap in an
// unmanaged host view whose run-time length comes from the functor.
template< class ReturnType , class FunctorType>
struct ParallelReduceReturnValue<typename std::enable_if<
                                  (is_array<ReturnType>::value || std::is_pointer<ReturnType>::value)
                                >::type, ReturnType, FunctorType> {
  typedef Kokkos::View<  typename std::remove_const<ReturnType>::type
                       , Kokkos::HostSpace
                       , Kokkos::MemoryUnmanaged
      > return_type;

  typedef InvalidType reducer_type;

  typedef typename return_type::value_type value_type[];

  // NOTE(review): requires FunctorType to expose a public `value_count`
  // member holding the array length — confirm against the functor contract.
  static return_type return_value(ReturnType& return_val,
                                  const FunctorType& functor) {
    return return_type(return_val,functor.value_count);
  }
};
+
// Specialization for reducer objects (Min, Max, MinLoc, ...): the reducer
// carries its own result view, so it is passed through unchanged.
template< class ReturnType , class FunctorType>
struct ParallelReduceReturnValue<typename std::enable_if<
                                   Kokkos::is_reducer_type<ReturnType>::value
                                >::type, ReturnType, FunctorType> {
  typedef ReturnType return_type;
  typedef ReturnType reducer_type;
  typedef typename return_type::value_type value_type;

  // The functor argument is unused here; it exists to keep a uniform
  // signature across all specializations.
  static return_type return_value(ReturnType& return_val,
                                  const FunctorType& functor) {
    return return_val;
  }
};
+}
+
+namespace Impl {
// Maps the user-supplied "policy" argument of parallel_reduce to an actual
// execution policy type.
template< class T, class ReturnType , class FunctorType>
struct ParallelReducePolicyType;

// Already an execution policy: pass it through unchanged.
template< class PolicyType , class FunctorType >
struct ParallelReducePolicyType<typename std::enable_if<Kokkos::Impl::is_execution_policy<PolicyType>::value>::type, PolicyType,FunctorType> {

  typedef PolicyType policy_type;
  static PolicyType policy(const PolicyType& policy_) {
    return policy_;
  }
};
+
// Integral "policy": interpret it as the extent of a [0, N) RangePolicy in
// the functor's default execution space.
template< class PolicyType , class FunctorType >
struct ParallelReducePolicyType<typename std::enable_if<std::is_integral<PolicyType>::value>::type, PolicyType,FunctorType> {
  typedef typename
    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
      execution_space ;

  typedef Kokkos::RangePolicy<execution_space> policy_type;

  static policy_type policy(const PolicyType& policy_) {
    return policy_type(0,policy_);
  }
};
+
+}
+
+namespace Impl {
  // Identity adapter: hands the user functor to ParallelReduce unchanged.
  // Exists as a customization point for backends that must wrap the functor.
  template< class FunctorType, class ExecPolicy, class ValueType, class ExecutionSpace>
  struct ParallelReduceFunctorType {
    typedef FunctorType functor_type;
    static const functor_type& functor(const functor_type& functor) {
      return functor;
    }
  };
+}
+
+namespace Impl {
+
  // Glue between the public parallel_reduce overloads and the backend
  // Impl::ParallelReduce driver: adapts the return argument to a view or
  // reducer, optionally wraps the functor, and brackets execution with
  // profiling hooks when profiling is enabled.
  template< class PolicyType, class FunctorType, class ReturnType >
  struct ParallelReduceAdaptor {
    typedef Impl::ParallelReduceReturnValue<void,ReturnType,FunctorType> return_value_adapter;
    #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
    typedef Impl::ParallelReduceFunctorType<FunctorType,PolicyType,
                                            typename return_value_adapter::value_type,
                                            typename PolicyType::execution_space> functor_adaptor;
    #endif
    static inline
    void execute(const std::string& label,
        const PolicyType& policy,
        const FunctorType& functor,
        ReturnType& return_value) {
          #if (KOKKOS_ENABLE_PROFILING)
            uint64_t kpID = 0;
            if(Kokkos::Profiling::profileLibraryLoaded()) {
              // Fall back to the functor's type name when no label was given.
              Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
            }
          #endif

          // Disable allocation tracking while the closure copies the
          // functor/policy/result into backend storage.
          Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
          #ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
          Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
             closure(functor_adaptor::functor(functor),
                     policy,
                     return_value_adapter::return_value(return_value,functor));
          #else
          Impl::ParallelReduce<FunctorType, PolicyType, typename return_value_adapter::reducer_type >
             closure(functor,
                     policy,
                     return_value_adapter::return_value(return_value,functor));
          #endif
          Kokkos::Impl::shared_allocation_tracking_release_and_enable();
          closure.execute();

          #if (KOKKOS_ENABLE_PROFILING)
            if(Kokkos::Profiling::profileLibraryLoaded()) {
              Kokkos::Profiling::endParallelReduce(kpID);
            }
          #endif
        }

  };
+}
+/*! \fn void parallel_reduce(label,policy,functor,return_argument)
+    \brief Perform a parallel reduction.
+    \param label An optional Label giving the call name. Must be able to construct a std::string from the argument.
+    \param policy A Kokkos Execution Policy, such as an integer, a RangePolicy or a TeamPolicy.
+    \param functor A functor with a reduction operator, and optional init, join and final functions.
+    \param return_argument A return argument which can be a scalar, a View, or a ReducerStruct. This argument can be left out if the functor has a final function.
+*/
+
+/** \brief  Parallel reduction
+ *
+ * parallel_reduce performs parallel reductions with arbitrary functions - i.e.
+ * it is not solely data based. The call expects up to 4 arguments:
+ *
+ *
+ * Example of a parallel_reduce functor for a POD (plain old data) value type:
+ * \code
+ *  class FunctorType { // For POD value type
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type ;
+ *    void operator()( <intType> iwork , <podType> & update ) const ;
+ *    void init( <podType> & update ) const ;
+ *    void join( volatile       <podType> & update ,
+ *               volatile const <podType> & input ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> & update ) const ;
+ *  };
+ * \endcode
+ *
+ * Example of a parallel_reduce functor for an array of POD (plain old data) values:
+ * \code
+ *  class FunctorType { // For array of POD value
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type[] ;
+ *    void operator()( <intType> , <podType> update[] ) const ;
+ *    void init( <podType> update[] ) const ;
+ *    void join( volatile       <podType> update[] ,
+ *               volatile const <podType> input[] ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> update[] ) const ;
+ *  };
+ * \endcode
+ */
+
+// ReturnValue is scalar or array: take by reference
+
// Labeled, execution-policy overload; scalar/array result taken by reference.
template< class PolicyType, class FunctorType, class ReturnType >
inline
void parallel_reduce(const std::string& label,
                     const PolicyType& policy,
                     const FunctorType& functor,
                     ReturnType& return_value,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value);
}
+
// Unlabeled, execution-policy overload; scalar/array result by reference.
template< class PolicyType, class FunctorType, class ReturnType >
inline
void parallel_reduce(const PolicyType& policy,
                     const FunctorType& functor,
                     ReturnType& return_value,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute("",policy,functor,return_value);
}
+
// Unlabeled overload over [0, policy) in the functor's default execution
// space; scalar/array result by reference.
template< class FunctorType, class ReturnType >
inline
void parallel_reduce(const size_t& policy,
                     const FunctorType& functor,
                     ReturnType& return_value) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute("",policy_type(0,policy),functor,return_value);
}
+
// Labeled overload over [0, policy); scalar/array result by reference.
template< class FunctorType, class ReturnType >
inline
void parallel_reduce(const std::string& label,
                     const size_t& policy,
                     const FunctorType& functor,
                     ReturnType& return_value) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
  Impl::ParallelReduceAdaptor<policy_type,FunctorType,ReturnType>::execute(label,policy_type(0,policy),functor,return_value);
}
+
+// ReturnValue as View or Reducer: take by copy to allow for inline construction
+
// Labeled, execution-policy overload taking the result (a View or a reducer
// object) by const reference so it can be constructed inline at the call.
template< class PolicyType, class FunctorType, class ReturnType >
inline
void parallel_reduce(const std::string& label,
                     const PolicyType& policy,
                     const FunctorType& functor,
                     const ReturnType& return_value,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value);
}
+
// Unlabeled, execution-policy overload; View/reducer result by const ref.
template< class PolicyType, class FunctorType, class ReturnType >
inline
void parallel_reduce(const PolicyType& policy,
                     const FunctorType& functor,
                     const ReturnType& return_value,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute("",policy,functor,return_value);
}
+
// Unlabeled overload over [0, policy); View/reducer result by const ref.
template< class FunctorType, class ReturnType >
inline
void parallel_reduce(const size_t& policy,
                     const FunctorType& functor,
                     const ReturnType& return_value) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;

  Impl::ParallelReduceAdaptor<policy_type,FunctorType,const ReturnType>::execute("",policy_type(0,policy),functor,return_value);
}
+
// Labeled overload over [0, policy); View/reducer result by const ref.
template< class FunctorType, class ReturnType >
inline
void parallel_reduce(const std::string& label,
                     const size_t& policy,
                     const FunctorType& functor,
                     const ReturnType& return_value) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
  Impl::ParallelReduceAdaptor<policy_type,FunctorType,const ReturnType>::execute(label,policy_type(0,policy),functor,return_value);
}
+
+// No Return Argument
+
// Labeled overload without a return argument: the functor is expected to
// handle the result itself (e.g. in a final() member). An empty unmanaged
// host view is passed as a placeholder result.
template< class PolicyType, class FunctorType>
inline
void parallel_reduce(const std::string& label,
                     const PolicyType& policy,
                     const FunctorType& functor,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
  // Static value size known -> scalar value_type; otherwise reduce through
  // the functor's pointer type into a runtime-sized array.
  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
                                     , typename ValueTraits::value_type
                                     , typename ValueTraits::pointer_type
                                     >::type value_type ;

  typedef Kokkos::View< value_type
              , Kokkos::HostSpace
              , Kokkos::MemoryUnmanaged
              > result_view_type;
  result_view_type result_view ;

  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute(label,policy,functor,result_view);
}
+
// Unlabeled, no-return-argument overload; see the labeled variant above.
template< class PolicyType, class FunctorType >
inline
void parallel_reduce(const PolicyType& policy,
                     const FunctorType& functor,
                     typename Impl::enable_if<
                       Kokkos::Impl::is_execution_policy<PolicyType>::value
                     >::type * = 0) {
  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
  // Static value size known -> scalar value_type; otherwise pointer type.
  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
                                     , typename ValueTraits::value_type
                                     , typename ValueTraits::pointer_type
                                     >::type value_type ;

  // Placeholder result: an empty unmanaged host view.
  typedef Kokkos::View< value_type
              , Kokkos::HostSpace
              , Kokkos::MemoryUnmanaged
              > result_view_type;
  result_view_type result_view ;

  Impl::ParallelReduceAdaptor<PolicyType,FunctorType,result_view_type>::execute("",policy,functor,result_view);
}
+
// Unlabeled, no-return-argument overload over [0, policy) in the functor's
// default execution space.
template< class FunctorType >
inline
void parallel_reduce(const size_t& policy,
                     const FunctorType& functor) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
  // Static value size known -> scalar value_type; otherwise pointer type.
  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
                                     , typename ValueTraits::value_type
                                     , typename ValueTraits::pointer_type
                                     >::type value_type ;

  // Placeholder result: an empty unmanaged host view.
  typedef Kokkos::View< value_type
              , Kokkos::HostSpace
              , Kokkos::MemoryUnmanaged
              > result_view_type;
  result_view_type result_view ;

  Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute("",policy_type(0,policy),functor,result_view);
}
+
// Labeled, no-return-argument overload over [0, policy).
template< class FunctorType>
inline
void parallel_reduce(const std::string& label,
                     const size_t& policy,
                     const FunctorType& functor) {
  typedef typename Impl::ParallelReducePolicyType<void,size_t,FunctorType>::policy_type policy_type;
  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
  // Static value size known -> scalar value_type; otherwise pointer type.
  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
                                     , typename ValueTraits::value_type
                                     , typename ValueTraits::pointer_type
                                     >::type value_type ;

  // Placeholder result: an empty unmanaged host view.
  typedef Kokkos::View< value_type
              , Kokkos::HostSpace
              , Kokkos::MemoryUnmanaged
              > result_view_type;
  result_view_type result_view ;

  Impl::ParallelReduceAdaptor<policy_type,FunctorType,result_view_type>::execute(label,policy_type(0,policy),functor,result_view);
}
+
+
+
+} //namespace Kokkos
diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
index 17654170ed..09a5993863 100644
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -66,11 +66,15 @@ public:
 
 private:
 
-  mutable char * m_iter ;
-  char *         m_end ;
+  mutable char * m_iter_L0 ;
+  char *         m_end_L0 ;
+  mutable char * m_iter_L1 ;
+  char *         m_end_L1 ;
+
 
   mutable int m_multiplier;
   mutable int m_offset;
+  mutable int m_default_level;
 
   ScratchMemorySpace();
   ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
@@ -95,34 +99,58 @@ public:
 
   template< typename IntType >
   KOKKOS_INLINE_FUNCTION
-  void* get_shmem (const IntType& size) const {
-    void* tmp = m_iter + m_offset * align (size);
-    if (m_end < (m_iter += align (size) * m_multiplier)) {
-      m_iter -= align (size) * m_multiplier; // put it back like it was
-  #ifdef KOKKOS_HAVE_DEBUG
-      // mfh 23 Jun 2015: printf call consumes 25 registers
-      // in a CUDA build, so only print in debug mode.  The
-      // function still returns NULL if not enough memory.
-      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
-              "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
-              long(m_end-m_iter));
-  #endif // KOKKOS_HAVE_DEBUG
-      tmp = 0;
+  void* get_shmem (const IntType& size, int level = -1) const {
+    if(level == -1)
+      level = m_default_level;
+    if(level == 0) {
+      void* tmp = m_iter_L0 + m_offset * align (size);
+      if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
+        m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
+        #ifdef KOKKOS_HAVE_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L0-m_iter_L0));
+        #endif // KOKKOS_HAVE_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+    } else {
+      void* tmp = m_iter_L1 + m_offset * align (size);
+      if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
+        m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
+        #ifdef KOKKOS_HAVE_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode.  The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L1-m_iter_L1));
+        #endif // KOKKOS_HAVE_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+
     }
-    return tmp;
   }
 
   template< typename IntType >
   KOKKOS_INLINE_FUNCTION
-  ScratchMemorySpace( void * ptr , const IntType & size )
-    : m_iter( (char *) ptr )
-    , m_end(  m_iter + size )
+  ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
+    : m_iter_L0( (char *) ptr_L0 )
+    , m_end_L0(  m_iter_L0 + size_L0 )
+    , m_iter_L1( (char *) ptr_L1 )
+    , m_end_L1(  m_iter_L1 + size_L1 )
     , m_multiplier( 1 )
     , m_offset( 0 )
+    , m_default_level( 0 )
     {}
 
   KOKKOS_INLINE_FUNCTION
-  const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
+  const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
+    m_default_level = level;
     m_multiplier = multiplier;
     m_offset = offset;
     return *this;
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index 656be5d09f..233b56c939 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,12 +50,17 @@
 #include <cstddef>
 #include <iosfwd>
 #include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+
+#include <KokkosExp_MDRangePolicy.hpp>
 
 #if defined( KOKKOS_HAVE_SERIAL )
 
@@ -142,7 +147,9 @@ public:
 
     // Init the array of locks used for arbitrarily sized atomics
     Impl::init_lock_array_host_space();
-
+    #if (KOKKOS_ENABLE_PROFILING)
+      Kokkos::Profiling::initialize();
+    #endif
   }
 
   static int is_initialized() { return 1 ; }
@@ -151,7 +158,11 @@ public:
   static int concurrency() {return 1;};
 
   //! Free any resources being consumed by the device.
-  static void finalize() {}
+  static void finalize() {
+    #if (KOKKOS_ENABLE_PROFILING)
+      Kokkos::Profiling::finalize();
+    #endif
+  }
 
   //! Print configuration information to the given output stream.
   static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
 {
 private:
 
-  size_t m_team_scratch_size ;
-  size_t m_thread_scratch_size ;
+  size_t m_team_scratch_size[2] ;
+  size_t m_thread_scratch_size[2] ;
   int    m_league_size ;
   int    m_chunk_size;
 
@@ -324,8 +335,10 @@ public:
 
   TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
     m_league_size = p.m_league_size;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -348,15 +361,15 @@ public:
 
   inline int team_size() const { return 1 ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
+  inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
 
   /** \brief  Specify league size, request team size */
   TeamPolicyInternal( execution_space &
             , int league_size_request
             , int /* team_size_request */
             , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_league_size( league_size_request )
     , m_chunk_size ( 32 )
     {}
@@ -365,8 +378,8 @@ public:
             , int league_size_request
             , const Kokkos::AUTO_t & /* team_size_request */
             , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_league_size( league_size_request )
     , m_chunk_size ( 32 )
     {}
@@ -374,8 +387,8 @@ public:
   TeamPolicyInternal( int league_size_request
             , int /* team_size_request */
             , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_league_size( league_size_request )
     , m_chunk_size ( 32 )
     {}
@@ -383,8 +396,8 @@ public:
   TeamPolicyInternal( int league_size_request
             , const Kokkos::AUTO_t & /* team_size_request */
             , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_league_size( league_size_request )
     , m_chunk_size ( 32 )
     {}
@@ -401,26 +414,23 @@ public:
 
   /** \brief set per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
 
   /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
   /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
@@ -440,7 +450,7 @@ namespace Kokkos {
 namespace Impl {
 
 template< class FunctorType , class ... Traits >
-class ParallelFor< FunctorType , 
+class ParallelFor< FunctorType ,
                    Kokkos::RangePolicy< Traits ... > ,
                    Kokkos::Serial
                  >
@@ -489,9 +499,10 @@ public:
 
 /*--------------------------------------------------------------------------*/
 
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType , class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
                     , Kokkos::Serial
                     >
 {
@@ -499,14 +510,19 @@ private:
 
   typedef Kokkos::RangePolicy< Traits ... > Policy ;
   typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
 
 
@@ -515,15 +531,15 @@ private:
   typename std::enable_if< std::is_same< TagType , void >::value >::type
   exec( pointer_type ptr ) const
     {
-      reference_type update = ValueInit::init( m_functor , ptr );
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( i , update );
       }
 
-      Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
-        final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
@@ -532,15 +548,15 @@ private:
   exec( pointer_type ptr ) const
     {
       const TagType t{} ;
-      reference_type update = ValueInit::init( m_functor , ptr );
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( t , i , update );
       }
 
-      Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
-        final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -549,25 +565,43 @@ public:
   void execute() const
     {
       pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size( m_functor ) , 0 );
+           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 
       this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
     }
 
-  template< class ViewType >
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.ptr_on_device() )
+    {
+      static_assert( Kokkos::is_view< HostViewType >::value
+        , "Kokkos::Serial reduce result must be a View" );
+
+      static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
+        , "Kokkos::Serial reduce result must be a View in HostSpace" );
+    }
+
+  inline
   ParallelReduce( const FunctorType & arg_functor
-                , const Policy      & arg_policy
-                , const ViewType    & arg_result )
+                , Policy       arg_policy
+                , const ReducerType& reducer )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_result_ptr( arg_result.ptr_on_device() )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
     {
-      static_assert( Kokkos::is_view< ViewType >::value
-        , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
-
-      static_assert( std::is_same< typename ViewType::memory_space
+      /*static_assert( std::is_same< typename ViewType::memory_space
                                       , Kokkos::HostSpace >::value
-        , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
     }
 };
 
@@ -697,15 +731,16 @@ public:
              , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_league(  arg_policy.league_size() )
-    , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
     { }
 };
 
 /*--------------------------------------------------------------------------*/
 
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType , class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::Serial
                     >
 {
@@ -714,30 +749,35 @@ private:
   typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
   typedef typename Policy::member_type                       Member ;
   typedef typename Policy::work_tag                          WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag > ValueInit ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const int          m_league ;
-  const int          m_shared ;
+  const ReducerType  m_reducer ;
         pointer_type m_result_ptr ;
+  const int          m_shared ;
 
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
   exec( pointer_type ptr ) const
     {
-      reference_type update = ValueInit::init( m_functor , ptr );
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
         m_functor( Member(ileague,m_league,m_shared) , update );
       }
 
-      Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
-        final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
@@ -747,14 +787,14 @@ private:
     {
       const TagType t{} ;
 
-      reference_type update = ValueInit::init( m_functor , ptr );
+      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
         m_functor( t , Member(ileague,m_league,m_shared) , update );
       }
 
-      Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
-        final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -763,7 +803,7 @@ public:
   void execute() const
     {
       pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size( m_functor ) , m_shared );
+           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
 
       this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
     }
@@ -771,12 +811,16 @@ public:
   template< class ViewType >
   ParallelReduce( const FunctorType  & arg_functor
                 , const Policy       & arg_policy
-                , const ViewType     & arg_result
-                )
+                , const ViewType     & arg_result ,
+                typename std::enable_if<
+                  Kokkos::is_view< ViewType >::value &&
+                  !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_league( arg_policy.league_size() )
-    , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
+    , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
     {
       static_assert( Kokkos::is_view< ViewType >::value
         , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@@ -786,6 +830,21 @@ public:
         , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
     }
 
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_league(  arg_policy.league_size() )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+
 };
 
 } // namespace Impl
@@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
 }
 }
 
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_Serial_Task.hpp>
+
 #endif // defined( KOKKOS_HAVE_SERIAL )
 #endif /* #define KOKKOS_SERIAL_HPP */
 
diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
index 5f999e9a34..fc9113b750 100644
--- a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -1,4 +1,3 @@
-
 /*
 //@HEADER
 // ************************************************************************
@@ -47,13 +46,655 @@
 #ifndef KOKKOS_TASKPOLICY_HPP
 #define KOKKOS_TASKPOLICY_HPP
 
+//----------------------------------------------------------------------------
+
 #include <Kokkos_Core_fwd.hpp>
+
+// If compiling with CUDA then must be using CUDA 8 or better
+// and use relocateable device code to enable the task policy.
+// nvcc relocatable device code option: --relocatable-device-code=true
+
+#if ( defined( KOKKOS_COMPILER_NVCC ) )
+  #if ( 8000 <= CUDA_VERSION ) && \
+      defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
+
+  #define KOKKOS_ENABLE_TASKPOLICY
+
+  #endif
+#else
+
+#define KOKKOS_ENABLE_TASKPOLICY
+
+#endif
+
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+
 #include <Kokkos_MemoryPool.hpp>
-#include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_Tags.hpp>
-#include <impl/Kokkos_StaticAssert.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_TaskQueue.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+enum TaskType { TaskTeam   = Impl::TaskBase<void,void,void>::TaskTeam
+              , TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
+
+enum TaskPriority { TaskHighPriority    = 0
+                  , TaskRegularPriority = 1
+                  , TaskLowPriority     = 2 };
+
+template< typename Space >
+class TaskPolicy ;
+
+template< typename Space >
+void wait( TaskPolicy< Space > const & );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/*\brief  Implementation data for task data management, access, and execution.
+ *
+ *  CRTP Inheritance structure to allow static_cast from the
+ *  task root type and a task's FunctorType.
+ *
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *    TaskBase< Space , ResultType , void >
+ *      : TaskBase< Space , void , void >
+ *      { ... };
+ */
+template< typename Space , typename ResultType , typename FunctorType >
+class TaskBase ;
+
+template< typename Space >
+class TaskExec ;
+
+}} // namespace Kokkos::Impl
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/**
+ *
+ *  Future< space >  // value_type == void
+ *  Future< value >  // space == Default
+ *  Future< value , space >
+ *
+ */
+template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
+class Future {
+private:
+
+  template< typename > friend class TaskPolicy ;
+  template< typename , typename > friend class Future ;
+  template< typename , typename , typename > friend class Impl::TaskBase ;
+
+  enum { Arg1_is_space  = Kokkos::Impl::is_space< Arg1 >::value };
+  enum { Arg2_is_space  = Kokkos::Impl::is_space< Arg2 >::value };
+  enum { Arg1_is_value  = ! Arg1_is_space &&
+                          ! std::is_same< Arg1 , void >::value };
+  enum { Arg2_is_value  = ! Arg2_is_space &&
+                          ! std::is_same< Arg2 , void >::value };
+
+  static_assert( ! ( Arg1_is_space && Arg2_is_space )
+               , "Future cannot be given two spaces" );
+
+  static_assert( ! ( Arg1_is_value && Arg2_is_value )
+               , "Future cannot be given two value types" );
+
+  using ValueType =
+    typename std::conditional< Arg1_is_value , Arg1 ,
+    typename std::conditional< Arg2_is_value , Arg2 , void
+    >::type >::type ;
+
+  using Space =
+    typename std::conditional< Arg1_is_space , Arg1 ,
+    typename std::conditional< Arg2_is_space , Arg2 , void
+    >::type >::type ;
+
+  using task_base  = Impl::TaskBase< Space , ValueType , void > ;
+  using queue_type = Impl::TaskQueue< Space > ;
+
+  task_base * m_task ;
+
+  KOKKOS_INLINE_FUNCTION explicit
+  Future( task_base * task ) : m_task(0)
+    { if ( task ) queue_type::assign( & m_task , task ); }
+
+  //----------------------------------------
+
+public:
+
+  using execution_space = typename Space::execution_space ;
+  using value_type      = ValueType ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_task ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int reference_count() const
+    { return 0 != m_task ? m_task->reference_count() : 0 ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr Future() noexcept : m_task(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Future( Future && rhs )
+    : m_task( rhs.m_task ) { rhs.m_task = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
+
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future && rhs )
+    {
+      if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    {
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  //----------------------------------------
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( Future<A1,A2> && rhs )
+    : m_task( rhs.m_task )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      rhs.m_task = 0 ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
+      return *this ;
+    }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( Future<A1,A2> && rhs )
+    {
+      static_assert
+        ( std::is_same< Space , void >::value ||
+          std::is_same< Space , typename Future<A1,A2>::Space >::value
+        , "Assigned Futures must have the same space" );
+
+      static_assert
+        ( std::is_same< value_type , void >::value ||
+          std::is_same< value_type , typename Future<A1,A2>::value_type >::value
+        , "Assigned Futures must have the same value_type" );
+
+      if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
+      m_task = rhs.m_task ;
+      rhs.m_task = 0 ;
+      return *this ;
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  typename task_base::get_return_type
+  get() const
+    {
+      if ( 0 == m_task ) {
+        Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
+      }
+      return m_task->get();
+    }
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename ExecSpace >
+class TaskPolicy
+{
+private:
+
+  using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+  using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
+  using task_base  = Impl::TaskBase< ExecSpace , void , void > ;
+
+  track_type   m_track ;
+  queue_type * m_queue ;
+
+  //----------------------------------------
+  // Process optional arguments to spawn and respawn functions
+
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const ) {}
+
+  // TaskTeam or TaskSingle
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , TaskType const & arg
+             , Options const & ... opts )
+    {
+      task->m_task_type = arg ;
+      assign( task , opts ... );
+    }
+
+  // TaskHighPriority or TaskRegularPriority or TaskLowPriority
+  template< typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , TaskPriority const & arg
+             , Options const & ... opts )
+    {
+      task->m_priority = arg ;
+      assign( task , opts ... );
+    }
+
+  // Future for a dependence
+  template< typename A1 , typename A2 , typename ... Options >
+  KOKKOS_INLINE_FUNCTION static
+  void assign( task_base * const task
+             , Future< A1 , A2 > const & arg 
+             , Options const & ... opts )
+    {
+      // Assign dependence to task->m_next
+      // which will be processed within subsequent call to schedule.
+      // Error if the dependence is reset.
+
+      if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
+        Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
+      }
+
+      if ( 0 != arg.m_task ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
+      }
+
+      assign( task , opts ... );
+    }
+
+  //----------------------------------------
+
+public:
+
+  using execution_policy = TaskPolicy ;
+  using execution_space  = ExecSpace ;
+  using memory_space     = typename queue_type::memory_space ;
+  using member_type      = Kokkos::Impl::TaskExec< ExecSpace > ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy() : m_track(), m_queue(0) {} // empty policy: no queue attached
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( TaskPolicy && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( TaskPolicy const & rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
+
+  TaskPolicy( memory_space const & arg_memory_space // allocates and constructs the shared task queue
+            , unsigned const arg_memory_pool_capacity
+            , unsigned const arg_memory_pool_log2_superblock = 12 ) // superblock = 2^12 bytes
+    : m_track()
+    , m_queue(0)
+    {
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord
+        < memory_space , typename queue_type::Destroy >
+          record_type ;
+
+      record_type * record =
+        record_type::allocate( arg_memory_space
+                             , "TaskQueue"
+                             , sizeof(queue_type)
+                             );
+
+      m_queue = new( record->data() )
+        queue_type( arg_memory_space
+                  , arg_memory_pool_capacity
+                  , arg_memory_pool_log2_superblock );
+
+      record->m_destroy.m_queue = m_queue ; // Destroy functor needs the queue pointer to destruct it
+
+      m_track.assign_allocated_record_to_uninitialized( record ); // tracker adopts the allocation record
+    }
+
+  //----------------------------------------
+  /**\brief  Memory pool block size consumed by a spawned task of FunctorType */
+  template< typename FunctorType >
+  KOKKOS_FUNCTION
+  size_t spawn_allocation_size() const
+    {
+      using task_type  = Impl::TaskBase< execution_space
+                                       , typename FunctorType::value_type
+                                       , FunctorType > ;
+
+      return m_queue->allocate_block_size( sizeof(task_type) );
+    }
+
+  /**\brief  Memory pool block size consumed by a when_all aggregate of narg dependences */
+  KOKKOS_FUNCTION
+  size_t when_all_allocation_size( int narg ) const
+    {
+      using task_base  = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
+
+      return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
+    }
+
+  //----------------------------------------
+
+  /**\brief  An executing task spawns a new task with options.
+   *         Returns a null future if memory pool allocation fails.
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   *  3) Team or Serial
+   */
+  template< typename FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  Future< typename FunctorType::value_type , ExecSpace >
+  task_spawn( FunctorType const & arg_functor 
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      //----------------------------------------
+      // Give single-thread back-ends an opportunity to clear
+      // queue of ready tasks before allocating a new task
+
+      m_queue->iff_single_thread_recursive_execute();
+
+      //----------------------------------------
+
+      future_type f ;
+
+      // Allocate task from memory pool
+      f.m_task =
+        reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
 
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new ( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two
+        // +1 for matching decrement when task is complete
+        // +1 for future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+
+        assign( f.m_task , arg_options... );
+
+        // Spawning from within the execution space so the
+        // apply function pointer is guaranteed to be valid
+        f.m_task->m_apply = task_type::apply ;
+
+        m_queue->schedule( f.m_task );
+        // this task may be updated or executed at any moment
+      }
+
+      return f ;
+    }
+
+  /**\brief  The host process spawns a task with options.
+   *         Returns a null future if memory pool allocation fails.
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   *  3) Team or Serial
+   */
+  template< typename FunctorType , typename ... Options >
+  inline
+  Future< typename FunctorType::value_type , ExecSpace >
+  host_spawn( FunctorType const & arg_functor 
+            , Options const & ... arg_options
+            ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using future_type = Future< value_type , execution_space > ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      future_type f ;
+
+      // Allocate task from memory pool
+      f.m_task = 
+        reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
+
+      if ( f.m_task ) {
+
+        // Placement new construction
+        new( f.m_task ) task_type( arg_functor );
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = sizeof(task_type);
+
+        assign( f.m_task , arg_options... );
+
+        // Potentially spawning outside execution space so the
+        // apply function pointer must be obtained from execution space.
+        // Required for Cuda execution space function pointer.
+        queue_type::specialization::template
+          proc_set_apply< FunctorType >( & f.m_task->m_apply );
+
+        m_queue->schedule( f.m_task );
+      }
+      return f ;
+    }
+
+  /**\brief  Return a future that is complete when all 'narg' input
+   *         futures are complete; null future if allocation fails.
+   */
+  template< typename A1 , typename A2 >
+  KOKKOS_FUNCTION
+  Future< ExecSpace >
+  when_all( int narg , Future< A1 , A2 > const * const arg ) const
+    {
+      static_assert
+        ( std::is_same< execution_space
+                      , typename Future< A1 , A2 >::execution_space
+                      >::value
+        , "Future must have same execution space" );
+
+      using future_type = Future< ExecSpace > ;
+      using task_base   = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
+
+      future_type f ;
+
+      size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
+
+      f.m_task =
+        reinterpret_cast< task_base * >( m_queue->allocate( size ) );
+
+      if ( f.m_task ) {
+
+        new( f.m_task ) task_base();
+
+        // Reference count starts at two:
+        // +1 to match decrement when task completes
+        // +1 for the future
+        f.m_task->m_queue      = m_queue ;
+        f.m_task->m_ref_count  = 2 ;
+        f.m_task->m_alloc_size = size ;
+        f.m_task->m_dep_count  = narg ;
+        f.m_task->m_task_type  = task_base::Aggregate ;
+
+        task_base ** const dep = f.m_task->aggregate_dependences();
+
+        // Assign dependences to increment their reference count
+        // The futures may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+
+        for ( int i = 0 ; i < narg ; ++i ) {
+          task_base * const t = dep[i] = arg[i].m_task ;
+          if ( 0 != t ) {
+            Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
+          }
+        }
+
+        m_queue->schedule( f.m_task );
+        // this when_all may be processed at any moment
+      }
+
+      return f ;
+    }
+
+  /**\brief  An executing task respawns itself with options
+   *
+   *  1) High, Normal, or Low priority
+   *  2) With or without dependence
+   */
+  template< class FunctorType , typename ... Options >
+  KOKKOS_FUNCTION
+  void respawn( FunctorType * task_self
+              , Options const & ... arg_options ) const
+    {
+      using value_type  = typename FunctorType::value_type ;
+      using task_type   = Impl::TaskBase< execution_space
+                                        , value_type
+                                        , FunctorType > ;
+
+      task_base * const zero = (task_base *) 0 ;
+      task_base * const lock = (task_base *) task_base::LockTag ;
+      task_type * const task = static_cast< task_type * >( task_self );
+
+      // Precondition:
+      //   task is in Executing state
+      //   therefore  m_next == LockTag
+      //
+      // Change to m_next == 0 for no dependence
+
+      if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
+        Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
+      }
+
+      assign( task , arg_options... );
+
+      // Postcondition:
+      //   task is in Executing-Respawn state
+      //   therefore  m_next == dependence or 0
+    }
+
+  //----------------------------------------
+
+  template< typename S >
+  friend
+  void Kokkos::wait( Kokkos::TaskPolicy< S > const & ); // grants wait() access to m_queue
+
+  //----------------------------------------
+  // Queue statistics accessors:
+
+  inline
+  int allocation_capacity() const noexcept
+    { return m_queue->m_memory.get_mem_size(); } // total bytes in the queue's memory pool
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const noexcept
+    { return m_queue->m_count_alloc ; } // tasks currently allocated
+
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count_max() const noexcept
+    { return m_queue->m_max_alloc ; } // high-water mark of allocated tasks
+
+  KOKKOS_INLINE_FUNCTION
+  long allocated_task_count_accum() const noexcept
+    { return m_queue->m_accum_alloc ; } // running total of task allocations
+};
+
+template< typename ExecSpace >
+inline
+void wait( TaskPolicy< ExecSpace > const & policy )
+{ policy.m_queue->execute(); } // runs the queue; presumably returns when all tasks complete
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#endif /* #define KOKKOS_TASKPOLICY_HPP */
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */
 
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
index 23efefce34..c9ebbf9265 100644
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Threads/Kokkos_ThreadsTeam.hpp>
 #include <Threads/Kokkos_Threads_Parallel.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
index 2f98b4dfa5..1cc8b03381 100644
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -46,69 +46,61 @@
 
 #include <type_traits>
 #include <string>
-#include <Kokkos_Core_fwd.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
+#include <algorithm>
+#include <initializer_list>
 
+#include <Kokkos_Core_fwd.hpp>
 #include <Kokkos_HostSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
-
-#include <impl/Kokkos_StaticAssert.hpp>
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Shape.hpp>
-#include <impl/Kokkos_AnalyzeShape.hpp>
-#include <impl/Kokkos_Tags.hpp>
-
-// Must define before includng <impl/Kokkos_ViewOffset.hpp>
-namespace Kokkos { struct ALL ; }
-
-#include <impl/Kokkos_ViewOffset.hpp>
-#include <impl/Kokkos_ViewSupport.hpp>
+#include <Kokkos_ExecPolicy.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
 namespace Impl {
 
-/** \brief  View specialization mapping of view traits to a specialization tag */
-template< class ValueType ,
-          class ArraySpecialize ,
-          class ArrayLayout ,
-          class MemorySpace ,
-          class MemoryTraits >
-struct ViewSpecialize ;
-
-/** \brief  Defines the type of a subview given a source view type
- *          and subview argument types.
- */
-template< class SrcViewType
-        , class Arg0Type
-        , class Arg1Type
-        , class Arg2Type
-        , class Arg3Type
-        , class Arg4Type
-        , class Arg5Type
-        , class Arg6Type
-        , class Arg7Type
-        >
-struct ViewSubview /* { typedef ... type ; } */ ;
+template< class DstMemorySpace , class SrcMemorySpace >
+struct DeepCopy ;
 
-template< class DstViewSpecialize ,
-          class SrcViewSpecialize = void ,
-          class Enable = void >
-struct ViewAssignment ;
+template< class DataType >
+struct ViewArrayAnalysis ;
 
-template< class DstMemorySpace , class SrcMemorySpace , class ExecutionSpace>
-struct DeepCopy ;
+template< class DataType , class ArrayLayout
+        , typename ValueType =
+          typename ViewArrayAnalysis< DataType >::non_const_value_type
+        >
+struct ViewDataAnalysis ;
+
+template< class , class ... >
+class ViewMapping { public: enum { is_assignable = false }; };
+
+template< class MemorySpace >
+struct ViewOperatorBoundsErrorAbort ;
+
+template<>
+struct ViewOperatorBoundsErrorAbort< Kokkos::HostSpace > {
+  static void apply( const size_t rank
+                   , const size_t n0 , const size_t n1
+                   , const size_t n2 , const size_t n3
+                   , const size_t n4 , const size_t n5
+                   , const size_t n6 , const size_t n7
+                   , const size_t i0 , const size_t i1
+                   , const size_t i2 , const size_t i3
+                   , const size_t i4 , const size_t i5
+                   , const size_t i6 , const size_t i7 );
+};
 
 } /* namespace Impl */
-} // namespace Kokkos
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
 
 /** \class ViewTraits
  *  \brief Traits class for accessing attributes of a View.
@@ -116,247 +108,189 @@ namespace Kokkos {
  * This is an implementation detail of View.  It is only of interest
  * to developers implementing a new specialization of View.
  *
- * Template argument permutations:
- *   - View< DataType , void         , void         , void >
- *   - View< DataType , Space        , void         , void >
- *   - View< DataType , Space        , MemoryTraits , void >
- *   - View< DataType , Space        , void         , MemoryTraits >
- *   - View< DataType , ArrayLayout  , void         , void >
- *   - View< DataType , ArrayLayout  , Space        , void >
- *   - View< DataType , ArrayLayout  , MemoryTraits , void   >
- *   - View< DataType , ArrayLayout  , Space        , MemoryTraits >
- *   - View< DataType , MemoryTraits , void         , void  >
+ * Template argument options:
+ *   - View< DataType >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , ArrayLayout >
+ *   - View< DataType , ArrayLayout , Space >
+ *   - View< DataType , ArrayLayout , MemoryTraits >
+ *   - View< DataType , ArrayLayout , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
  */
 
-template< class DataType ,
-          class Arg1 = void ,
-          class Arg2 = void ,
-          class Arg3 = void >
-class ViewTraits {
-private:
+template< class DataType , class ... Properties >
+struct ViewTraits ;
+
+template<>
+struct ViewTraits< void >
+{ // Base case of the property-list recursion: all traits unspecified.
+  typedef void  execution_space ;
+  typedef void  memory_space ;
+  typedef void  HostMirrorSpace ;
+  typedef void  array_layout ;
+  typedef void  memory_traits ;
+};
+
+template< class ... Prop >
+struct ViewTraits< void , void , Prop ... >
+{
+  // Skip an extraneous 'void' property and recurse on the remainder.
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef typename ViewTraits<void,Prop...>::array_layout     array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class ArrayLayout , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_array_layout<ArrayLayout>::value >::type , ArrayLayout , Prop ... >
+{
+  // ArrayLayout property: record the layout, recurse for the
+  // remaining space and memory-traits arguments.
+  typedef typename ViewTraits<void,Prop...>::execution_space  execution_space ;
+  typedef typename ViewTraits<void,Prop...>::memory_space     memory_space ;
+  typedef typename ViewTraits<void,Prop...>::HostMirrorSpace  HostMirrorSpace ;
+  typedef          ArrayLayout                                array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits    memory_traits ;
+};
+
+template< class Space , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_space<Space>::value >::type , Space , Prop ... >
+{
+  // Space property: at most a memory-traits argument may follow.
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::HostMirrorSpace , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value
+               , "Only one View Execution or Memory Space template argument" );
+
+  typedef typename Space::execution_space                   execution_space ;
+  typedef typename Space::memory_space                      memory_space ;
+  typedef typename Kokkos::Impl::is_space< Space >::host_mirror_space
+      HostMirrorSpace ;
+  typedef typename execution_space::array_layout            array_layout ;
+  typedef typename ViewTraits<void,Prop...>::memory_traits  memory_traits ;
+};
+
+template< class MemoryTraits , class ... Prop >
+struct ViewTraits< typename std::enable_if< Kokkos::Impl::is_memory_traits<MemoryTraits>::value >::type , MemoryTraits , Prop ... >
+{
+  // MemoryTraits property: must be the last argument, none may follow.
+
+  static_assert( std::is_same< typename ViewTraits<void,Prop...>::execution_space , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_space    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::array_layout    , void >::value &&
+                 std::is_same< typename ViewTraits<void,Prop...>::memory_traits   , void >::value
+               , "MemoryTrait is the final optional template argument for a View" );
+
+  typedef void          execution_space ;
+  typedef void          memory_space ;
+  typedef void          HostMirrorSpace ;
+  typedef void          array_layout ;
+  typedef MemoryTraits  memory_traits ;
+};
 
-  // Layout, Space, and MemoryTraits are optional
-  // but need to appear in that order. That means Layout
-  // can only be Arg1, Space can be Arg1 or Arg2, and
-  // MemoryTraits can be Arg1, Arg2 or Arg3
 
-  enum { Arg1IsLayout = Impl::is_array_layout<Arg1>::value };
+template< class DataType , class ... Properties >
+struct ViewTraits {
+private:
 
-  enum { Arg1IsSpace = Impl::is_space<Arg1>::value };
-  enum { Arg2IsSpace = Impl::is_space<Arg2>::value };
+  // Unpack the properties arguments
+  typedef ViewTraits< void , Properties ... >  prop ;
 
-  enum { Arg1IsMemoryTraits = Impl::is_memory_traits<Arg1>::value };
-  enum { Arg2IsMemoryTraits = Impl::is_memory_traits<Arg2>::value };
-  enum { Arg3IsMemoryTraits = Impl::is_memory_traits<Arg3>::value };
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::execution_space , void >::value
+                    , typename prop::execution_space
+                    , Kokkos::DefaultExecutionSpace
+                    >::type
+      ExecutionSpace ;
 
-  enum { Arg1IsVoid = Impl::is_same< Arg1 , void >::value };
-  enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value };
-  enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value };
+  typedef typename
+    std::conditional< ! std::is_same< typename prop::memory_space , void >::value
+                    , typename prop::memory_space
+                    , typename ExecutionSpace::memory_space
+                    >::type
+      MemorySpace ;
 
-  // Arg1 is Layout, Space, MemoryTraits, or void
   typedef typename
-    Impl::StaticAssert<
-      ( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid )
-      , Arg1 >::type Arg1Verified ;
-
-  // If Arg1 is Layout       then Arg2 is Space, MemoryTraits, or void
-  // If Arg1 is Space        then Arg2 is MemoryTraits or void
-  // If Arg1 is MemoryTraits then Arg2 is void
-  // If Arg1 is Void         then Arg2 is void
+    std::conditional< ! std::is_same< typename prop::array_layout , void >::value
+                    , typename prop::array_layout
+                    , typename ExecutionSpace::array_layout
+                    >::type
+      ArrayLayout ;
+
   typedef typename
-    Impl::StaticAssert<
-      ( Arg1IsLayout       && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
-      ( Arg1IsSpace        && ( 0 == Arg2IsSpace ) && ( 1 == Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
-      ( Arg1IsMemoryTraits && Arg2IsVoid ) ||
-      ( Arg1IsVoid         && Arg2IsVoid )
-      , Arg2 >::type Arg2Verified ;
-
-  // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits
+    std::conditional
+      < ! std::is_same< typename prop::HostMirrorSpace , void >::value
+      , typename prop::HostMirrorSpace
+      , typename Kokkos::Impl::is_space< ExecutionSpace >::host_mirror_space
+      >::type
+      HostMirrorSpace ;
+
   typedef typename
-    Impl::StaticAssert<
-      ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) &&
-      ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 )
-      , Arg3 >::type Arg3Verified ;
-
-  // Arg1 or Arg2 may have execution and memory spaces
-  typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
-          typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
-          Kokkos::DefaultExecutionSpace
-          >::type >::type::execution_space  ExecutionSpace ;
-
-  typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
-          typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
-          Kokkos::DefaultExecutionSpace
-          >::type >::type::memory_space  MemorySpace ;
-
-  typedef typename Impl::is_space<
-    typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
-    typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
-    Kokkos::DefaultExecutionSpace
-    >::type >::type >::host_mirror_space  HostMirrorSpace ;
-
-  // Arg1 may be array layout
-  typedef typename Impl::if_c< Arg1IsLayout , Arg1Verified ,
-          typename ExecutionSpace::array_layout
-          >::type ArrayLayout ;
-
-  // Arg1, Arg2, or Arg3 may be memory traits
-  typedef typename Impl::if_c< Arg1IsMemoryTraits , Arg1Verified ,
-          typename Impl::if_c< Arg2IsMemoryTraits , Arg2Verified ,
-          typename Impl::if_c< Arg3IsMemoryTraits , Arg3Verified ,
-          MemoryManaged
-          >::type >::type >::type  MemoryTraits ;
-
-  typedef Impl::AnalyzeShape<DataType> analysis ;
+    std::conditional< ! std::is_same< typename prop::memory_traits , void >::value
+                    , typename prop::memory_traits
+                    , typename Kokkos::MemoryManaged
+                    >::type
+      MemoryTraits ;
+
+  // Analyze data type's properties,
+  // May be specialized based upon the layout and value type
+  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType , ArrayLayout > data_analysis ;
 
 public:
 
   //------------------------------------
   // Data type traits:
 
-  typedef DataType                            data_type ;
-  typedef typename analysis::const_type       const_data_type ;
-  typedef typename analysis::non_const_type   non_const_data_type ;
+  typedef typename data_analysis::type            data_type ;
+  typedef typename data_analysis::const_type      const_data_type ;
+  typedef typename data_analysis::non_const_type  non_const_data_type ;
 
   //------------------------------------
-  // Array of intrinsic scalar type traits:
+  // Compatible array of trivial type traits:
 
-  typedef typename analysis::array_intrinsic_type            array_intrinsic_type ;
-  typedef typename analysis::const_array_intrinsic_type      const_array_intrinsic_type ;
-  typedef typename analysis::non_const_array_intrinsic_type  non_const_array_intrinsic_type ;
+  typedef typename data_analysis::scalar_array_type            scalar_array_type ;
+  typedef typename data_analysis::const_scalar_array_type      const_scalar_array_type ;
+  typedef typename data_analysis::non_const_scalar_array_type  non_const_scalar_array_type ;
 
   //------------------------------------
   // Value type traits:
 
-  typedef typename analysis::value_type            value_type ;
-  typedef typename analysis::const_value_type      const_value_type ;
-  typedef typename analysis::non_const_value_type  non_const_value_type ;
+  typedef typename data_analysis::value_type            value_type ;
+  typedef typename data_analysis::const_value_type      const_value_type ;
+  typedef typename data_analysis::non_const_value_type  non_const_value_type ;
 
   //------------------------------------
-  // Layout and shape traits:
+  // Mapping traits:
 
-  typedef ArrayLayout                array_layout ;
-  typedef typename analysis::shape   shape_type ;
+  typedef ArrayLayout                         array_layout ;
+  typedef typename data_analysis::dimension   dimension ;
+  typedef typename data_analysis::specialize  specialize /* mapping specialization tag */ ;
 
-  enum { rank         = shape_type::rank };
-  enum { rank_dynamic = shape_type::rank_dynamic };
+  enum { rank         = dimension::rank };
+  enum { rank_dynamic = dimension::rank_dynamic };
 
   //------------------------------------
   // Execution space, memory space, memory access traits, and host mirror space.
 
-  typedef ExecutionSpace   execution_space ;
-  typedef MemorySpace      memory_space ;
-  typedef Device<ExecutionSpace,MemorySpace>  device_type ;
-  typedef MemoryTraits     memory_traits ;
-  typedef HostMirrorSpace  host_mirror_space ;
+  typedef ExecutionSpace                              execution_space ;
+  typedef MemorySpace                                 memory_space ;
+  typedef Kokkos::Device<ExecutionSpace,MemorySpace>  device_type ;
+  typedef MemoryTraits                                memory_traits ;
+  typedef HostMirrorSpace                             host_mirror_space ;
 
-  typedef typename memory_space::size_type  size_type ;
+  typedef typename MemorySpace::size_type  size_type ;
 
-  enum { is_hostspace      = Impl::is_same< memory_space , HostSpace >::value };
-  enum { is_managed        = memory_traits::Unmanaged == 0 };
-  enum { is_random_access  = memory_traits::RandomAccess == 1 };
+  enum { is_hostspace      = std::is_same< MemorySpace , HostSpace >::value };
+  enum { is_managed        = MemoryTraits::Unmanaged    == 0 };
+  enum { is_random_access  = MemoryTraits::RandomAccess == 1 };
 
   //------------------------------------
-
-
-  //------------------------------------
-  // Specialization tag:
-
-  typedef typename
-    Impl::ViewSpecialize< value_type
-                        , typename analysis::specialize
-                        , array_layout
-                        , memory_space
-                        , memory_traits
-                        >::type specialize ;
-};
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-class ViewDefault {};
-
-/** \brief  Default view specialization has LayoutLeft, LayoutRight, or LayoutStride.
- */
-template< class ValueType , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< ValueType , void , LayoutLeft , MemorySpace , MemoryTraits >
-{ typedef ViewDefault type ; };
-
-template< class ValueType , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< ValueType , void , LayoutRight , MemorySpace , MemoryTraits >
-{ typedef ViewDefault type ; };
-
-template< class ValueType , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< ValueType , void , LayoutStride , MemorySpace , MemoryTraits >
-{ typedef ViewDefault type ; };
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-/** \brief Types for compile-time detection of View usage errors */
-namespace ViewError {
-
-struct allocation_constructor_requires_managed {};
-struct allocation_constructor_requires_nonconst {};
-struct user_pointer_constructor_requires_unmanaged {};
-struct device_shmem_constructor_requires_unmanaged {};
-
-struct scalar_operator_called_from_non_scalar_view {};
-
-} /* namespace ViewError */
-
-//----------------------------------------------------------------------------
-/** \brief  Enable view parentheses operator for
- *          match of layout and integral arguments.
- *          If correct rank define type from traits,
- *          otherwise define type as an error message.
- */
-template< class ReturnType , class Traits , class Layout , unsigned Rank ,
-          typename iType0 = int , typename iType1 = int ,
-          typename iType2 = int , typename iType3 = int ,
-          typename iType4 = int , typename iType5 = int ,
-          typename iType6 = int , typename iType7 = int ,
-          class Enable = void >
-struct ViewEnableArrayOper ;
-
-template< class ReturnType , class Traits , class Layout , unsigned Rank ,
-          typename iType0 , typename iType1 ,
-          typename iType2 , typename iType3 ,
-          typename iType4 , typename iType5 ,
-          typename iType6 , typename iType7 >
-struct ViewEnableArrayOper<
-   ReturnType , Traits , Layout , Rank ,
-   iType0 , iType1 , iType2 , iType3 ,
-   iType4 , iType5 , iType6 , iType7 ,
-   typename enable_if<
-     iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 &&
-     iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 &&
-     is_same< typename Traits::array_layout , Layout >::value &&
-     ( unsigned(Traits::rank) == Rank )
-   >::type >
-{
-  typedef ReturnType type ;
 };
 
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
 /** \class View
  *  \brief View to an array of data.
  *
@@ -376,11 +310,13 @@ namespace Kokkos {
  * they may occur.
  *
  * Valid ways in which template arguments may be specified:
- *   - View< DataType , Space >
- *   - View< DataType , Space  ,         MemoryTraits >
- *   - View< DataType , Space  , void  , MemoryTraits >
+ *   - View< DataType >
+ *   - View< DataType , Layout >
  *   - View< DataType , Layout , Space >
  *   - View< DataType , Layout , Space , MemoryTraits >
+ *   - View< DataType , Space >
+ *   - View< DataType , Space , MemoryTraits >
+ *   - View< DataType , MemoryTraits >
  *
  * \tparam DataType (required) This indicates both the type of each
  *   entry of the array, and the combination of compile-time and
@@ -437,1194 +373,1425 @@ namespace Kokkos {
  * }
  * \endcode
  */
-template< class DataType ,
-          class Arg1Type = void , /* ArrayLayout, SpaceType, or MemoryTraits */
-          class Arg2Type = void , /* SpaceType or MemoryTraits */
-          class Arg3Type = void , /* MemoryTraits */
-          class Specialize =
-            typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize >
+template< class DataType , class ... Properties >
 class View ;
 
-template< class C >
-struct is_view : public Impl::bool_< false > {};
-
-template< class D , class A1 , class A2 , class A3 , class S >
-struct is_view< View< D , A1 , A2 , A3 , S > > : public Impl::bool_< true > {};
-
-namespace Impl {
-using Kokkos::is_view ;
-}
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-template< class DataType ,
-          class Arg1Type ,
-          class Arg2Type ,
-          class Arg3Type >
-class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewDefault >
-  : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
-{
-public:
+#include <impl/KokkosExp_ViewMapping.hpp>
+#include <impl/KokkosExp_ViewArray.hpp>
 
-  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
-private:
+namespace Kokkos {
+namespace Experimental {
 
-  // Assignment of compatible views requirement:
-  template< class , class , class , class , class > friend class View ;
+namespace {
 
-  // Assignment of compatible subview requirement:
-  template< class , class , class > friend struct Impl::ViewAssignment ;
+constexpr Kokkos::Experimental::Impl::ALL_t
+  ALL = Kokkos::Experimental::Impl::ALL_t();
 
-  // Dimensions, cardinality, capacity, and offset computation for
-  // multidimensional array view of contiguous memory.
-  // Inherits from Impl::Shape
-  typedef Impl::ViewOffset< typename traits::shape_type
-                          , typename traits::array_layout
-                          > offset_map_type ;
+constexpr Kokkos::Experimental::Impl::WithoutInitializing_t
+  WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t();
 
-  // Intermediary class for data management and access
-  typedef Impl::ViewDataManagement< traits > view_data_management ;
+constexpr Kokkos::Experimental::Impl::AllowPadding_t
+  AllowPadding        = Kokkos::Experimental::Impl::AllowPadding_t();
 
-  //----------------------------------------
-  // Data members:
+}
 
-  typename view_data_management::handle_type  m_ptr_on_device ;
-  offset_map_type                             m_offset_map ;
-  view_data_management                        m_management ;
-  Impl::AllocationTracker                     m_tracker ;
+/** \brief  Create View allocation parameter bundle from argument list.
+ *
+ *  Valid argument list members are:
+ *    1) label as a "string" or std::string
+ *    2) memory space instance of the View::memory_space type
+ *    3) execution space instance compatible with the View::memory_space
+ *    4) Kokkos::WithoutInitializing to bypass initialization
+ *    4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
+ */
+template< class ... Args >
+inline
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_alloc( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
 
-  //----------------------------------------
+  static_assert( ! return_type::has_pointer
+               , "Cannot give pointer-to-memory for view allocation" );
 
-public:
+  return return_type( args... );
+}
 
-  /** return type for all indexing operators */
-  typedef typename view_data_management::return_type reference_type ;
+template< class ... Args >
+inline
+Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+view_wrap( Args const & ... args )
+{
+  typedef
+    Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
+      return_type ;
 
-  enum { reference_type_is_lvalue = view_data_management::ReturnTypeIsReference };
+  static_assert( ! return_type::has_memory_space &&
+                 ! return_type::has_execution_space &&
+                 ! return_type::has_label &&
+                 return_type::has_pointer
+               , "Must only give pointer-to-memory for view wrapping" );
 
-  typedef View< typename traits::array_intrinsic_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > array_type ;
+  return return_type( args... );
+}
 
-  typedef View< typename traits::const_data_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > const_type ;
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::device_type ,
-                typename traits::memory_traits > non_const_type ;
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::host_mirror_space ,
-                void > HostMirror ;
+namespace Kokkos {
+namespace Experimental {
 
-  //------------------------------------
-  // Shape
+template< class DataType , class ... Properties >
+class View ;
 
-  enum { Rank = traits::rank };
+template< class > struct is_view : public std::false_type {};
 
-  KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { return m_offset_map.cardinality(); }
+template< class D, class ... P >
+struct is_view< View<D,P...> > : public std::true_type {};
 
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  typename traits::size_type dimension( const iType & i ) const
-    { return Impl::dimension( m_offset_map , i ); }
+template< class D, class ... P >
+struct is_view< const View<D,P...> > : public std::true_type {};
 
-  //------------------------------------
-  // Destructor, constructors, assignment operators:
+template< class DataType , class ... Properties >
+class View : public ViewTraits< DataType , Properties ... > {
+private:
 
-  KOKKOS_INLINE_FUNCTION
-  ~View() {}
+  template< class , class ... > friend class View ;
+  template< class , class ... > friend class Impl::ViewMapping ;
 
-  KOKKOS_INLINE_FUNCTION
-  View()
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    { m_offset_map.assign(0, 0,0,0,0,0,0,0,0); }
+public:
 
-  KOKKOS_INLINE_FUNCTION
-  View( const View & rhs )
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      (void) Impl::ViewAssignment<
-         typename traits::specialize ,
-         typename traits::specialize >( *this , rhs );
-    }
+  typedef ViewTraits< DataType , Properties ... > traits ;
 
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View & rhs )
-    {
-      (void) Impl::ViewAssignment<
-         typename traits::specialize ,
-         typename traits::specialize >( *this , rhs );
-      return *this ;
-    }
+private:
 
-  //------------------------------------
-  // Construct or assign compatible view:
+  typedef Kokkos::Experimental::Impl::ViewMapping< traits , void > map_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker      track_type ;
 
-  template< class RT , class RL , class RD , class RM , class RS >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<RT,RL,RD,RM,RS> & rhs )
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      (void) Impl::ViewAssignment<
-         typename traits::specialize , RS >( *this , rhs );
-    }
+  track_type  m_track ;
+  map_type    m_map ;
 
-  template< class RT , class RL , class RD , class RM , class RS >
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View<RT,RL,RD,RM,RS> & rhs )
-    {
-      (void) Impl::ViewAssignment<
-         typename traits::specialize , RS >( *this , rhs );
-      return *this ;
-    }
+public:
 
-  //------------------------------------
-  /**\brief Allocation of a managed view with possible alignment padding.
-   *
-   *  Allocation properties for allocating and initializing to the default value_type:
-   *    Kokkos::ViewAllocate()
-   *    Kokkos::ViewAllocate("label")  OR  "label"
-   *    Kokkos::ViewAllocate(std::string("label"))  OR  std::string("label")
-   *
-   *  Allocation properties for allocating and bypassing initialization:
-   *    Kokkos::ViewAllocateWithoutInitializing()
-   *    Kokkos::ViewAllocateWithoutInitializing("label")
-   */
+  //----------------------------------------
+  /** \brief  Compatible view of array of scalar types */
+  typedef View< typename traits::scalar_array_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    array_type ;
 
-  template< class AllocationProperties >
-  explicit inline
-  View( const AllocationProperties & prop ,
-        // Impl::ViewAllocProp::size_type exists when the traits and allocation properties
-        // are valid for allocating viewed memory.
-        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 ,
-        const size_t n1 = 0 ,
-        const size_t n2 = 0 ,
-        const size_t n3 = 0 ,
-        const size_t n4 = 0 ,
-        const size_t n5 = 0 ,
-        const size_t n6 = 0 ,
-        const size_t n7 = 0 ,
-        const size_t n8 = 0 )
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
+  /** \brief  Compatible view of const data type */
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    const_type ;
 
-      static_assert(!std::is_same<typename traits::array_layout, LayoutStride>::value,
-                         "LayoutStride does not support View constructor which takes dimensions directly!");
+  /** \brief  Compatible view of non-const data type */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits >
+    non_const_type ;
 
-      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
-      if(Alloc::AllowPadding)
-        m_offset_map.set_padding();
+  /** \brief  Compatible HostMirror view */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space >
+    HostMirror ;
 
-      m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker );
+  //----------------------------------------
+  // Domain rank and extents
 
-    }
+  enum { Rank = map_type::Rank };
 
-  template< class AllocationProperties >
-  explicit inline
-  View( const AllocationProperties & prop ,
-        const typename traits::array_layout & layout ,
-        // Impl::ViewAllocProp::size_type exists when the traits and allocation properties
-        // are valid for allocating viewed memory.
-        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type = 0 )
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
+ /** \brief rank() to be implemented
+  */
+  //KOKKOS_INLINE_FUNCTION
+  //static
+  //constexpr unsigned rank() { return map_type::Rank; }
 
-      m_offset_map.assign( layout );
-      if(Alloc::AllowPadding)
-        m_offset_map.set_padding();
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  extent( const iType & r ) const
+    { return m_map.extent(r); }
 
-      m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker );
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , int >::type
+  extent_int( const iType & r ) const
+    { return static_cast<int>(m_map.extent(r)); }
 
-      m_management.set_noncontiguous();
-    }
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename traits::array_layout layout() const
+    { return m_map.layout(); }
 
-  //------------------------------------
-  // Assign an unmanaged View from pointer, can be called in functors.
-  // No alignment padding is performed.
+  //----------------------------------------
+  /*  Deprecate all 'dimension' functions in favor of
+   *  ISO/C++ vocabulary 'extent'.
+   */
 
-  template< class Type >
-  explicit KOKKOS_INLINE_FUNCTION
-  View( Type * ptr ,
-        typename Impl::ViewRawPointerProp< traits , Type >::size_type n0 = 0 ,
-        const size_t n1 = 0 ,
-        const size_t n2 = 0 ,
-        const size_t n3 = 0 ,
-        const size_t n4 = 0 ,
-        const size_t n5 = 0 ,
-        const size_t n6 = 0 ,
-        const size_t n7 = 0 ,
-        const size_t n8 = 0 )
-    : m_ptr_on_device(ptr)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
-      m_management.set_unmanaged();
-    }
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION constexpr
+  typename std::enable_if< std::is_integral<iType>::value , size_t >::type
+  dimension( const iType & r ) const { return extent( r ); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
 
-  template< class Type >
-  explicit KOKKOS_INLINE_FUNCTION
-  View( Type * ptr ,
-        typename traits::array_layout const & layout ,
-        typename Impl::ViewRawPointerProp< traits , Type >::size_type = 0 )
-    : m_ptr_on_device(ptr)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      m_offset_map.assign( layout );
-      m_management.set_unmanaged();
-      m_management.set_noncontiguous();
-    }
+  //----------------------------------------
 
+  KOKKOS_INLINE_FUNCTION constexpr size_t size() const { return m_map.dimension_0() *
+                                                                m_map.dimension_1() *
+                                                                m_map.dimension_2() *
+                                                                m_map.dimension_3() *
+                                                                m_map.dimension_4() *
+                                                                m_map.dimension_5() *
+                                                                m_map.dimension_6() *
+                                                                m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
 
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION void stride( iType * const s ) const { m_map.stride(s); }
 
-  //------------------------------------
-  // Assign a View from an AllocationTracker,
-  // The allocator used must be compatiable with the memory space of the view
-  // No alignment padding is performed.
-  // TODO: Should these allow padding??? DJS 01/15/15
-  explicit
-  View( Impl::AllocationTracker const &arg_tracker ,
-        const size_t n0 = 0 ,
-        const size_t n1 = 0 ,
-        const size_t n2 = 0 ,
-        const size_t n3 = 0 ,
-        const size_t n4 = 0 ,
-        const size_t n5 = 0 ,
-        const size_t n6 = 0 ,
-        const size_t n7 = 0 ,
-        const size_t n8 = 0 )
-    : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr()))
-    , m_offset_map()
-    , m_management()
-    , m_tracker(arg_tracker)
-    {
-      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
+  //----------------------------------------
+  // Range span is the span which contains all members.
 
-      const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type);
-      if ( m_tracker.alloc_size() < req_size ) {
-        Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size");
-      }
-    }
+  typedef typename map_type::reference_type  reference_type ;
+  typedef typename map_type::pointer_type    pointer_type ;
 
-  explicit
-  View( Impl::AllocationTracker const & arg_tracker
-      , typename traits::array_layout const & layout )
-    : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr()))
-    , m_offset_map()
-    , m_management()
-    , m_tracker(arg_tracker)
-    {
-      m_offset_map.assign( layout );
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
 
-      const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type);
-      if ( m_tracker.alloc_size() < req_size ) {
-        Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size");
-      }
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  // Deprecated, use 'span()' instead
+  KOKKOS_INLINE_FUNCTION constexpr size_t capacity() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { return m_map.data(); }
 
-      m_management.set_noncontiguous();
-    }
+  // Deprecated, use 'span_is_contigous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr pointer_type ptr_on_device() const { return m_map.data(); }
 
-  //------------------------------------
-  /** \brief  Constructors for subviews requires following
-   *          type-compatibility condition, enforce via StaticAssert.
-   *
-   *  Impl::is_same< View ,
-   *                 typename Impl::ViewSubview< View<D,A1,A2,A3,Impl::ViewDefault>
-   *                                           , ArgType0 , ArgType1 , ArgType2 , ArgType3
-   *                                           , ArgType4 , ArgType5 , ArgType6 , ArgType7
-   *                 >::type >::value
-   */
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
-          , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2 , const SubArg3_type & arg3
-      , const SubArg4_type & arg4 , const SubArg5_type & arg5
-      , const SubArg6_type & arg6 , const SubArg7_type & arg7
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
-          , class SubArg4_type , class SubArg5_type , class SubArg6_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2 , const SubArg3_type & arg3
-      , const SubArg4_type & arg4 , const SubArg5_type & arg5
-      , const SubArg6_type & arg6
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
-          , class SubArg4_type , class SubArg5_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2 , const SubArg3_type & arg3
-      , const SubArg4_type & arg4 , const SubArg5_type & arg5
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
-          , class SubArg4_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2 , const SubArg3_type & arg3
-      , const SubArg4_type & arg4
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2 , const SubArg3_type & arg3
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type , class SubArg2_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      , const SubArg2_type & arg2
-      );
-
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type , class SubArg1_type
-          >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0 , const SubArg1_type & arg1
-      );
+  //----------------------------------------
+  // Allow specializations to query their specialized map
 
-  template< class D , class A1 , class A2 , class A3
-          , class SubArg0_type
-          >
   KOKKOS_INLINE_FUNCTION
-  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
-      , const SubArg0_type & arg0
-      );
-
-  //------------------------------------
-  // Assign unmanaged View to portion of execution space's shared memory
-
-  typedef Impl::if_c< ! traits::is_managed ,
-                      const typename traits::execution_space::scratch_memory_space & ,
-                      Impl::ViewError::device_shmem_constructor_requires_unmanaged >
-      if_scratch_memory_constructor ;
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( typename if_scratch_memory_constructor::type space ,
-        const unsigned n0 = 0 ,
-        const unsigned n1 = 0 ,
-        const unsigned n2 = 0 ,
-        const unsigned n3 = 0 ,
-        const unsigned n4 = 0 ,
-        const unsigned n5 = 0 ,
-        const unsigned n6 = 0 ,
-        const unsigned n7 = 0 )
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      typedef typename traits::value_type  value_type_ ;
-
-      enum { align = 8 };
-      enum { mask  = align - 1 };
-
-      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+  const Kokkos::Experimental::Impl::ViewMapping< traits , void > &
+  implementation_map() const { return m_map ; }
 
-      typedef Impl::if_c< ! traits::is_managed ,
-                          value_type_ * ,
-                          Impl::ViewError::device_shmem_constructor_requires_unmanaged >
-        if_device_shmem_pointer ;
+  //----------------------------------------
 
-      // Select the first argument:
-      m_ptr_on_device = if_device_shmem_pointer::select(
-       (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
-    }
+private:
 
-  explicit KOKKOS_INLINE_FUNCTION
-  View( typename if_scratch_memory_constructor::type space ,
-        typename traits::array_layout const & layout)
-    : m_ptr_on_device()
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-    {
-      typedef typename traits::value_type  value_type_ ;
+  enum {
+    is_layout_left = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutLeft >::value ,
 
-      typedef Impl::if_c< ! traits::is_managed ,
-                          value_type_ * ,
-                          Impl::ViewError::device_shmem_constructor_requires_unmanaged >
-        if_device_shmem_pointer ;
+    is_layout_right = std::is_same< typename traits::array_layout
+                                  , Kokkos::LayoutRight >::value ,
 
-      m_offset_map.assign( layout );
-      m_management.set_unmanaged();
-      m_management.set_noncontiguous();
+    is_layout_stride = std::is_same< typename traits::array_layout
+                                   , Kokkos::LayoutStride >::value ,
 
-      enum { align = 8 };
-      enum { mask  = align - 1 };
+    is_default_map =
+      std::is_same< typename traits::specialize , void >::value &&
+      ( is_layout_left || is_layout_right || is_layout_stride )
+  };
 
-      // Select the first argument:
-      m_ptr_on_device = if_device_shmem_pointer::select(
-       (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
-    }
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
 
-  static inline
-  unsigned shmem_size( const unsigned n0 = 0 ,
-                       const unsigned n1 = 0 ,
-                       const unsigned n2 = 0 ,
-                       const unsigned n3 = 0 ,
-                       const unsigned n4 = 0 ,
-                       const unsigned n5 = 0 ,
-                       const unsigned n6 = 0 ,
-                       const unsigned n7 = 0 )
-  {
-    enum { align = 8 };
-    enum { mask  = align - 1 };
+#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify(); \
+  Kokkos::Experimental::Impl::view_verify_operator_bounds ARG ;
 
-    typedef typename traits::value_type  value_type_ ;
+#else
 
-    offset_map_type offset_map ;
+#define KOKKOS_VIEW_OPERATOR_VERIFY( ARG ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace \
+    < Kokkos::Impl::ActiveExecutionMemorySpace , typename traits::memory_space >::verify();
 
-    offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+#endif
 
-    return unsigned( sizeof(value_type_) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ;
-  }
+public:
 
-  //------------------------------------
-  // Is not allocated
+  //------------------------------
+  // Rank 0 operator()
 
+  template< class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  bool is_null() const { return 0 == ptr_on_device() ; }
-
-  //------------------------------------
-  // Operators for scalar (rank zero) views.
-
-  typedef Impl::if_c< traits::rank == 0 ,
-                      typename traits::value_type ,
-                      Impl::ViewError::scalar_operator_called_from_non_scalar_view >
-    if_scalar_operator ;
-
-  typedef Impl::if_c< traits::rank == 0 ,
-                      reference_type ,
-                      Impl::ViewError::scalar_operator_called_from_non_scalar_view >
-    if_scalar_operator_return ;
-  KOKKOS_INLINE_FUNCTION
-  const View & operator = ( const typename if_scalar_operator::type & rhs ) const
+  typename std::enable_if<( Kokkos::Impl::are_integral<Args...>::value
+                            && ( 0 == Rank )
+                          ), reference_type >::type
+  operator()( Args ... args ) const
     {
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
-      m_ptr_on_device[ 0 ] = if_scalar_operator::select( rhs );
-      return *this ;
-    }
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,args...) )
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  operator typename if_scalar_operator_return::type () const
-    {
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
-      return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
+      return m_map.reference();
     }
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename if_scalar_operator_return::type operator()() const
-    {
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
-      return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
-    }
+  //------------------------------
+  // Rank 1 operator()
 
+  template< typename I0
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename if_scalar_operator_return::type operator*() const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const
     {
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
-      return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
-    }
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
 
-  //------------------------------------
-  // Array member access operators enabled if
-  // (1) a zero value of all argument types are compile-time comparable to zero
-  // (2) the rank matches the number of arguments
-  // (3) the memory space is valid for the access
-  //------------------------------------
-  // rank 1:
-  // Specialisation for LayoutLeft and LayoutRight since we know its stride 1
+      return m_map.reference(i0);
+    }
 
-  template< typename iType0 >
+  template< typename I0
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type
-    operator[] ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.m_handle[ i0 ];
     }
 
-  template< typename iType0 >
+  template< typename I0
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits,  LayoutLeft, 1, iType0 >::type
-    operator() ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,Args...>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,args...) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
     }
 
-  template< typename iType0 >
+  //------------------------------
+  // Rank 1 operator[]
+
+  template< typename I0 >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type
-    at( const iType0 & i0 , const int , const int , const int ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.reference(i0);
     }
 
-  template< typename iType0 >
+  template< typename I0 >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type
-    operator[] ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && ! is_layout_stride
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.m_handle[ i0 ];
     }
 
-  template< typename iType0 >
+  template< typename I0 >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits,  LayoutRight, 1, iType0 >::type
-    operator() ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0>::value
+      && ( 1 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator[]( const I0 & i0 ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
     }
 
-  template< typename iType0 >
+  //------------------------------
+  // Rank 2
+
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type
-    at( const iType0 & i0 , const int , const int , const int ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ i0 ];
+      return m_map.reference(i0,i1);
     }
 
-  template< typename iType0 >
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits,
-                 typename Impl::if_c<
-                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
-                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
-                   void, typename traits::array_layout>::type,
-                 1, iType0 >::type
-    operator[] ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0) ];
+      return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
     }
 
-  template< typename iType0 >
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits,
-                 typename Impl::if_c<
-                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
-                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
-                   void, typename traits::array_layout>::type,
-                 1, iType0 >::type
-    operator() ( const iType0 & i0 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_left && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0) ];
+      return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
     }
 
-  template< typename iType0 >
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type , traits,
-                 typename Impl::if_c<
-                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
-                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
-                   void, typename traits::array_layout>::type,
-                 1, iType0 >::type
-    at( const iType0 & i0 , const int , const int , const int ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic == 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0) ];
+      return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
     }
 
-  // rank 2:
-
-  template< typename iType0 , typename iType1 >
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 2, iType0, iType1 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_right && ( traits::rank_dynamic != 0 )
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1) ];
+      return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
     }
 
-  template< typename iType0 , typename iType1 >
+  template< typename I0 , typename I1
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 2, iType0, iType1 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const int , const int ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,Args...>::value
+      && ( 2 == Rank )
+      && is_default_map
+      && is_layout_stride
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1) ];
+      return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
+                             i1 * m_map.m_offset.m_stride.S1 ];
     }
 
-  // rank 3:
+  //------------------------------
+  // Rank 3
 
-  template< typename iType0 , typename iType1 , typename iType2 >
+  template< typename I0 , typename I1 , typename I2
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 >
+  template< typename I0 , typename I1 , typename I2
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,Args...>::value
+      && ( 3 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2) ];
+      return m_map.reference(i0,i1,i2);
     }
 
-  // rank 4:
+  //------------------------------
+  // Rank 4
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-        const int , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,Args...>::value
+      && ( 4 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ];
+      return m_map.reference(i0,i1,i2,i3);
     }
 
-  // rank 5:
+  //------------------------------
+  // Rank 5
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-                 const iType4 & i4 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-        const iType4 & i4 , const int , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,Args...>::value
+      && ( 5 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ];
+      return m_map.reference(i0,i1,i2,i3,i4);
     }
 
-  // rank 6:
+  //------------------------------
+  // Rank 6
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 6,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-                 const iType4 & i4 , const iType5 & i5 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 6,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-        const iType4 & i4 , const iType5 & i5 , const int , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,Args...>::value
+      && ( 6 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ];
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
     }
 
-  // rank 7:
+  //------------------------------
+  // Rank 7
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 , typename iType6 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 7,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 , typename iType6 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 7,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-        const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,Args...>::value
+      && ( 7 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ];
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
     }
 
-  // rank 8:
+  //------------------------------
+  // Rank 8
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 8,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type
-    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ];
+      return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
     }
 
-  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
-            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7
+          , class ... Args >
   KOKKOS_FORCEINLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< reference_type ,
-                                      traits, typename traits::array_layout, 8,
-                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type
-    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
-        const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+  typename std::enable_if<
+    ( Kokkos::Impl::are_integral<I0,I1,I2,I3,I4,I5,I6,I7,Args...>::value
+      && ( 8 == Rank )
+      && ! is_default_map
+    ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
+            , Args ... args ) const
     {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      KOKKOS_VIEW_OPERATOR_VERIFY( (m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
 
-      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ];
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
     }
 
-  //------------------------------------
-  // Access to the underlying contiguous storage of this view specialization.
-  // These methods are specific to specialization of a view.
+#undef KOKKOS_VIEW_OPERATOR_VERIFY
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename traits::value_type * ptr_on_device() const
-    { return (typename traits::value_type *) m_ptr_on_device ; }
+  //----------------------------------------
+  // Standard destructor, constructors, and assignment operators
 
-  // Stride of physical storage, dimensioned to at least Rank
-  template< typename iType >
   KOKKOS_INLINE_FUNCTION
-  void stride( iType * const s ) const
-  { m_offset_map.stride(s); }
+  ~View() {}
 
-  // Count of contiguously allocated data members including padding.
   KOKKOS_INLINE_FUNCTION
-  typename traits::size_type capacity() const
-  { return m_offset_map.capacity(); }
+  View() : m_track(), m_map() {}
 
-  // If the view data can be treated (deep copied)
-  // as a contiguous block of memory.
   KOKKOS_INLINE_FUNCTION
-  bool is_contiguous() const
-  { return m_management.is_contiguous(); }
+  View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
 
-  const Impl::AllocationTracker & tracker() const { return m_tracker; }
+  KOKKOS_INLINE_FUNCTION
+  View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
+
+  //----------------------------------------
+  // Compatible view copy constructor and assignment
+  // may assign unmanaged from managed.
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,RP...> & rhs )
+    : m_track( rhs.m_track , traits::is_managed )
+    , m_map()
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  template< class RT , class ... RP >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,RP...> & rhs )
+    {
+      typedef typename View<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.assign( rhs.m_track , traits::is_managed );
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Compatible subview constructor
+  // may assign unmanaged from managed.
+
+  template< class RT , class ... RP , class Arg0 , class ... Args >
+  KOKKOS_INLINE_FUNCTION
+  View( const View< RT , RP... > & src_view
+      , const Arg0 & arg0 , Args ... args )
+    : m_track( src_view.m_track , traits::is_managed )
+    , m_map()
+    {
+      typedef View< RT , RP... > SrcType ;
+
+      typedef Kokkos::Experimental::Impl::ViewMapping
+        < void /* deduce destination view type from source view traits */
+        , typename SrcType::traits
+        , Arg0 , Args... > Mapping ;
+
+      typedef typename Mapping::type DstType ;
+
+      static_assert( Kokkos::Experimental::Impl::ViewMapping< traits , typename DstType::traits , void >::is_assignable
+        , "Subview construction requires compatible view and subview arguments" );
+
+      Mapping::assign( m_map, src_view.m_map, arg0 , args... );
+    }
+
+  //----------------------------------------
+  // Allocation tracking properties
+
+  KOKKOS_INLINE_FUNCTION
+  int use_count() const
+    { return m_track.use_count(); }
+
+  inline
+  const std::string label() const
+    { return m_track.template get_label< typename traits::memory_space >(); }
+
+  //----------------------------------------
+  // Allocation according to allocation properties and array layout
+
+  template< class ... P >
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track()
+    , m_map()
+    {
+      // Append layout and spaces if not input
+      typedef Impl::ViewCtorProp< P ... > alloc_prop_input ;
+
+      // use 'std::integral_constant<unsigned,I>' for non-types
+      // to avoid duplicate class error.
+      typedef Impl::ViewCtorProp
+        < P ...
+        , typename std::conditional
+            < alloc_prop_input::has_label
+            , std::integral_constant<unsigned,0>
+            , typename std::string
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_memory_space
+            , std::integral_constant<unsigned,1>
+            , typename traits::device_type::memory_space
+            >::type
+        , typename std::conditional
+            < alloc_prop_input::has_execution_space
+            , std::integral_constant<unsigned,2>
+            , typename traits::device_type::execution_space
+            >::type
+        > alloc_prop ;
+
+      static_assert( traits::is_managed
+                   , "View allocation constructor requires managed memory" );
+
+      if ( alloc_prop::initialize &&
+           ! alloc_prop::execution_space::is_initialized() ) {
+        // If initializing view data then
+        // the execution space must be initialized.
+        Kokkos::Impl::throw_runtime_exception("Constructing View and initializing data with uninitialized execution space");
+      }
+
+      // Copy the input allocation properties with possibly defaulted properties
+      alloc_prop prop( arg_prop );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      // If allocating in CudaUVMSpace must fence before and after
+      // the allocation to protect against possible concurrent access
+      // on the CPU and the GPU.
+      // Fence using the trait's execution space (which will be Kokkos::Cuda)
+      // to avoid incomplete type errors from using Kokkos::Cuda directly.
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      Kokkos::Experimental::Impl::SharedAllocationRecord<> *
+        record = m_map.allocate_shared( prop , arg_layout );
+
+//------------------------------------------------------------
+#if defined( KOKKOS_HAVE_CUDA )
+      if ( std::is_same< Kokkos::CudaUVMSpace , typename traits::device_type::memory_space >::value ) {
+        traits::device_type::memory_space::execution_space::fence();
+      }
+#endif
+//------------------------------------------------------------
+
+      // Setup and initialization complete, start tracking
+      m_track.assign_allocated_record_to_uninitialized( record );
+    }
+
+  // Wrap memory according to properties and array layout
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , typename traits::array_layout
+                               >::type const & arg_layout
+      )
+    : m_track() // No memory tracking
+    , m_map( arg_prop , arg_layout )
+    {
+      static_assert(
+        std::is_same< pointer_type
+                    , typename Impl::ViewCtorProp< P... >::pointer_type
+                    >::value ,
+        "Constructing View to wrap user memory must supply matching pointer type" );
+    }
+
+  // Simple dimension-only layout
+  template< class ... P >
+  explicit inline
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< ! Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  template< class ... P >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const Impl::ViewCtorProp< P ... > & arg_prop
+      , typename std::enable_if< Impl::ViewCtorProp< P... >::has_pointer
+                               , size_t
+                               >::type const arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( arg_prop
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // Allocate with label and layout
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
+          typename traits::array_layout >::type const & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label ) , arg_layout )
+    {}
+
+  // Allocate with label and layout, must disambiguate from subview constructor.
+  template< typename Label >
+  explicit inline
+  View( const Label & arg_label
+      , typename std::enable_if<
+          Kokkos::Experimental::Impl::is_view_label<Label>::value ,
+        const size_t >::type arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string >( arg_label )
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  // For backward compatibility
+  explicit inline
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
+          , arg_layout
+          )
+    {}
+
+  explicit inline
+  View( const ViewAllocateWithoutInitializing & arg_prop
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp< std::string , Kokkos::Experimental::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::Experimental::WithoutInitializing )
+          , typename traits::array_layout
+              ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+              , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  static constexpr size_t required_allocation_size(
+                                       const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span(
+        typename traits::array_layout
+          ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+          , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+    }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( pointer_type arg_ptr
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr)
+          , typename traits::array_layout
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+          )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( pointer_type arg_ptr
+      , const typename traits::array_layout & arg_layout
+      )
+    : View( Impl::ViewCtorProp<pointer_type>(arg_ptr) , arg_layout )
+    {}
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  static inline
+  size_t shmem_size( const size_t arg_N0 = ~size_t(0) ,
+                     const size_t arg_N1 = ~size_t(0) ,
+                     const size_t arg_N2 = ~size_t(0) ,
+                     const size_t arg_N3 = ~size_t(0) ,
+                     const size_t arg_N4 = ~size_t(0) ,
+                     const size_t arg_N5 = ~size_t(0) ,
+                     const size_t arg_N6 = ~size_t(0) ,
+                     const size_t arg_N7 = ~size_t(0) )
+  {
+    const size_t num_passed_args =
+      ( arg_N0 != ~size_t(0) ) + ( arg_N1 != ~size_t(0) ) + ( arg_N2 != ~size_t(0) ) +
+      ( arg_N3 != ~size_t(0) ) + ( arg_N4 != ~size_t(0) ) + ( arg_N5 != ~size_t(0) ) +
+      ( arg_N6 != ~size_t(0) ) + ( arg_N7 != ~size_t(0) );
+
+    if ( std::is_same<typename traits::specialize,void>::value && num_passed_args != traits::rank_dynamic ) {
+      Kokkos::abort( "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n" );
+    }
+
+    return map_type::memory_span(
+           typename traits::array_layout
+            ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+            , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) );
+  }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const typename traits::array_layout & arg_layout )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem( map_type::memory_span( arg_layout ) ) ) )
+         , arg_layout )
+    {}
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0 )
+    : View( Impl::ViewCtorProp<pointer_type>(
+              reinterpret_cast<pointer_type>(
+                arg_space.get_shmem(
+                  map_type::memory_span(
+                    typename traits::array_layout
+                     ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) ) ) ) )
+          , typename traits::array_layout
+             ( arg_N0 , arg_N1 , arg_N2 , arg_N3
+             , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+       )
+    {}
 };
 
-} /* namespace Kokkos */
+
+ /** \brief Temporary free function rank()
+  *         until rank() is implemented
+  *         in the View
+  */
+  template < typename D , class ... P >
+  KOKKOS_INLINE_FUNCTION
+  constexpr unsigned rank( const View<D , P...> & V ) { return V.Rank; } //Temporary until added to view
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
+template< class V , class ... Args >
+using Subview =
+  typename Kokkos::Experimental::Impl::ViewMapping
+    < void /* deduce subview type from source view traits */
+    , typename V::traits
+    , Args ...
+    >::type ;
 
-template< class LT , class LL , class LD , class LM , class LS ,
-          class RT , class RL , class RD , class RM , class RS >
+template< class D, class ... P , class ... Args >
 KOKKOS_INLINE_FUNCTION
-typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type
-operator == ( const View<LT,LL,LD,LM,LS> & lhs ,
-              const View<RT,RL,RD,RM,RS> & rhs )
+typename Kokkos::Experimental::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::type
+subview( const View< D, P... > & src , Args ... args )
 {
-  // Same data, layout, dimensions
-  typedef ViewTraits<LT,LL,LD,LM> lhs_traits ;
-  typedef ViewTraits<RT,RL,RD,RM> rhs_traits ;
-
-  return
-    Impl::is_same< typename lhs_traits::const_data_type ,
-                   typename rhs_traits::const_data_type >::value &&
-    Impl::is_same< typename lhs_traits::array_layout ,
-                   typename rhs_traits::array_layout >::value &&
-    Impl::is_same< typename lhs_traits::memory_space ,
-                   typename rhs_traits::memory_space >::value &&
-    Impl::is_same< typename lhs_traits::specialize ,
-                   typename rhs_traits::specialize >::value &&
-    lhs.ptr_on_device() == rhs.ptr_on_device() &&
-    lhs.shape()         == rhs.shape() ;
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,
+    "subview requires one argument for each source View rank" );
+
+  return typename
+    Kokkos::Experimental::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >::type( src , args ... );
 }
 
-template< class LT , class LL , class LD , class LM , class LS ,
-          class RT , class RL , class RD , class RM , class RS >
+template< class MemoryTraits , class D, class ... P , class ... Args >
 KOKKOS_INLINE_FUNCTION
-bool operator != ( const View<LT,LL,LD,LM,LS> & lhs ,
-                   const View<RT,RL,RD,RM,RS> & rhs )
+typename Kokkos::Experimental::Impl::ViewMapping
+  < void /* deduce subview type from source view traits */
+  , ViewTraits< D , P... >
+  , Args ...
+  >::template apply< MemoryTraits >::type
+subview( const View< D, P... > & src , Args ... args )
 {
-  return ! operator==( lhs , rhs );
+  static_assert( View< D , P... >::Rank == sizeof...(Args) ,
+    "subview requires one argument for each source View rank" );
+
+  return typename
+    Kokkos::Experimental::Impl::ViewMapping
+      < void /* deduce subview type from source view traits */
+      , ViewTraits< D , P ... >
+      , Args ... >
+      ::template apply< MemoryTraits >
+      ::type( src , args ... );
 }
 
-//----------------------------------------------------------------------------
 
 
-} // namespace Kokkos
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
 
-//----------------------------------------------------------------------------
-/** \brief  Deep copy a value into a view.
- */
-template< class DT , class DL , class DD , class DM , class DS >
-inline
-void deep_copy( const View<DT,DL,DD,DM,DS> & dst ,
-                typename Impl::enable_if<(
-                  Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type ,
-                                 typename ViewTraits<DT,DL,DD,DM>::value_type >::value
-                ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value )
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
 {
-  Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value );
+  // Same data, layout, dimensions
+  typedef ViewTraits<LT,LP...>  lhs_traits ;
+  typedef ViewTraits<RT,RP...>  rhs_traits ;
+
+  return
+    std::is_same< typename lhs_traits::const_value_type ,
+                  typename rhs_traits::const_value_type >::value &&
+    std::is_same< typename lhs_traits::array_layout ,
+                  typename rhs_traits::array_layout >::value &&
+    std::is_same< typename lhs_traits::memory_space ,
+                  typename rhs_traits::memory_space >::value &&
+    unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) &&
+    lhs.data()        == rhs.data() &&
+    lhs.span()        == rhs.span() &&
+    lhs.dimension_0() == rhs.dimension_0() &&
+    lhs.dimension_1() == rhs.dimension_1() &&
+    lhs.dimension_2() == rhs.dimension_2() &&
+    lhs.dimension_3() == rhs.dimension_3() &&
+    lhs.dimension_4() == rhs.dimension_4() &&
+    lhs.dimension_5() == rhs.dimension_5() &&
+    lhs.dimension_6() == rhs.dimension_6() &&
+    lhs.dimension_7() == rhs.dimension_7();
 }
 
-template< class ST , class SL , class SD , class SM , class SS >
-inline
-typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type
-deep_copy( ST & dst , const View<ST,SL,SD,SM,SS> & src )
+template< class LT , class ... LP , class RT , class ... RP >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const View<LT,LP...> & lhs ,
+                   const View<RT,RP...> & rhs )
 {
-  typedef  ViewTraits<ST,SL,SD,SM>  src_traits ;
-  typedef typename src_traits::memory_space  src_memory_space ;
-  Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) );
+  return ! ( operator==(lhs,rhs) );
 }
 
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-/** \brief  A deep copy between views of compatible type, and rank zero.
- */
-template< class DT , class DL , class DD , class DM , class DS ,
-          class ST , class SL , class SD , class SM , class SS >
-inline
-void deep_copy( const View<DT,DL,DD,DM,DS> & dst ,
-                const View<ST,SL,SD,SM,SS> & src ,
-                typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
-                                 typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value
-                  &&
-                  // Rank zero:
-                  ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) &&
-                  ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) )
-                )>::type * = 0 )
-{
-  typedef  View<DT,DL,DD,DM,DS>  dst_type ;
-  typedef  View<ST,SL,SD,SM,SS>  src_type ;
 
-  typedef typename dst_type::memory_space  dst_memory_space ;
-  typedef typename src_type::memory_space  src_memory_space ;
-  typedef typename src_type::value_type    value_type ;
+namespace Kokkos {
+namespace Impl {
 
-  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
-    Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) );
-  }
-}
+inline
+void shared_allocation_tracking_claim_and_disable()
+{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
 
-//----------------------------------------------------------------------------
-/** \brief  A deep copy between views of the default specialization, compatible type,
- *          same non-zero rank, same contiguous layout.
- */
-template< class DT , class DL , class DD , class DM ,
-          class ST , class SL , class SD , class SM >
 inline
-void deep_copy( const View<DT,DL,DD,DM,Impl::ViewDefault> & dst ,
-                const View<ST,SL,SD,SM,Impl::ViewDefault> & src ,
-                typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type ,
-                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value
-                  &&
-                  // Same non-zero rank:
-                  ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ==
-                    unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) )
-                  &&
-                  ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) )
-                  &&
-                  // Same layout:
-                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout ,
-                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value
-                )>::type * = 0 )
-{
-  typedef  View<DT,DL,DD,DM,Impl::ViewDefault>  dst_type ;
-  typedef  View<ST,SL,SD,SM,Impl::ViewDefault>  src_type ;
+void shared_allocation_tracking_release_and_enable()
+{ Kokkos::Experimental::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
 
-  typedef typename dst_type::memory_space  dst_memory_space ;
-  typedef typename src_type::memory_space  src_memory_space ;
+} /* namespace Impl */
+} /* namespace Kokkos */
 
-  enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout
-           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value ||
-           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value };
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
-  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
 
-    // Same shape (dimensions)
+template< class OutputView , typename Enable = void >
+struct ViewFill {
 
-    const bool shapes_are_equal = dst.shape() == src.shape();
+  typedef typename OutputView::const_value_type  const_value_type ;
 
-    if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) {
+  const OutputView output ;
+  const_value_type input ;
 
-      // Views span equal length contiguous range.
-      // Assuming can perform a straight memory copy over this range.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t n1 = output.dimension_1();
+    const size_t n2 = output.dimension_2();
+    const size_t n3 = output.dimension_3();
+    const size_t n4 = output.dimension_4();
+    const size_t n5 = output.dimension_5();
+    const size_t n6 = output.dimension_6();
+    const size_t n7 = output.dimension_7();
+
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
+    }}}}}}}
+  }
 
-      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      typedef typename OutputView::execution_space  execution_space ;
+      typedef Kokkos::RangePolicy< execution_space > Policy ;
 
-      Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
-    }
-    else {
-      // Destination view's execution space must be able to directly access source memory space
-      // in order for the ViewRemap functor run in the destination memory space's execution space.
-      size_t stride[8];
-      src.stride(stride);
-      size_t size_stride = stride[0]*src.dimension_0();
-      size_t size_dim = src.dimension_0();
-      for(int i = 1; i<src.rank; i++) {
-        if(stride[i]*src.dimension(i)>size_stride)
-          size_stride = stride[i]*src.dimension(i);
-        size_dim*=src.dimension(i);
-      }
+      const Kokkos::Impl::ParallelFor< ViewFill , Policy > closure( *this , Policy( 0 , output.dimension_0() ) );
 
-      if( shapes_are_equal && size_stride == size_dim) {
-        const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+      closure.execute();
 
-        Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
-      } else {
-        Impl::ViewRemap< dst_type , src_type >( dst , src );
-      }
+      execution_space::fence();
     }
-  }
-}
-
+};
 
-/** \brief Deep copy equal dimension arrays in the same space which
- *         have different layouts or specializations.
- */
-template< class DT , class DL , class DD , class DM , class DS ,
-          class ST , class SL , class SD , class SM , class SS >
-inline
-void deep_copy( const View< DT, DL, DD, DM, DS > & dst ,
-                const View< ST, SL, SD, SM, SS > & src ,
-                const typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
-                                 typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value
-                  &&
-                  // Source memory space is accessible to destination memory space
-                  Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space
-                                                           , typename View<ST,SL,SD,SM,SS>::memory_space >::value
-                  &&
-                  // Same non-zero rank
-                  ( unsigned( View<DT,DL,DD,DM,DS>::rank ) ==
-                    unsigned( View<ST,SL,SD,SM,SS>::rank ) )
-                  &&
-                  ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) )
-                  &&
-                  // Different layout or different specialization:
-                  ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout ,
-                                       typename View<ST,SL,SD,SM,SS>::array_layout >::value )
-                    ||
-                    ( ! Impl::is_same< DS , SS >::value )
-                  )
-                )>::type * = 0 )
-{
-  typedef View< DT, DL, DD, DM, DS > dst_type ;
-  typedef View< ST, SL, SD, SM, SS > src_type ;
+template< class OutputView >
+struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > {
+  ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
+    }
+};
 
-  assert_shapes_equal_dimension( dst.shape() , src.shape() );
+template< class OutputView , class InputView , class ExecSpace = typename OutputView::execution_space >
+struct ViewRemap {
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    {
+      typedef Kokkos::RangePolicy< ExecSpace > Policy ;
+      const Kokkos::Impl::ParallelFor< ViewRemap , Policy > closure( *this , Policy( 0 , n0 ) );
+      closure.execute();
+    }
 
-  Impl::ViewRemap< dst_type , src_type >( dst , src );
-}
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7);
+    }}}}}}}
+  }
+};
 
-}
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
 
-//----------------------------------------------------------------------------
-/** \brief  Deep copy a value into a view.
- */
-template< class ExecSpace, class DT , class DL , class DD , class DM , class DS >
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class DT , class ... DP >
 inline
-void deep_copy( const ExecSpace&, const View<DT,DL,DD,DM,DS> & dst ,
-                typename Impl::enable_if<(
-                  Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type ,
-                                 typename ViewTraits<DT,DL,DD,DM>::value_type >::value
-                ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value )
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
 {
-  Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value );
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
+
+  Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value );
 }
 
-template< class ExecSpace, class ST , class SL , class SD , class SM , class SS >
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ST , class ... SP >
 inline
-typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type
-deep_copy( const ExecSpace& exec,  ST & dst , const View<ST,SL,SD,SM,SS> & src )
+void deep_copy
+  ( typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
 {
-  typedef  ViewTraits<ST,SL,SD,SM>  src_traits ;
+  static_assert( ViewTraits<ST,SP...>::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  typedef ViewTraits<ST,SP...>               src_traits ;
   typedef typename src_traits::memory_space  src_memory_space ;
-  Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace >( exec , & dst , src.ptr_on_device() , sizeof(ST) );
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
 }
 
 //----------------------------------------------------------------------------
-/** \brief  A deep copy between views of compatible type, and rank zero.
- */
-template< class ExecSpace ,
-          class DT , class DL , class DD , class DM , class DS ,
-          class ST , class SL , class SD , class SM , class SS >
+/** \brief  A deep copy between views of compatible type, and rank zero.  */
+template< class DT , class ... DP , class ST , class ... SP >
 inline
-void deep_copy( const ExecSpace& exec,
-                const View<DT,DL,DD,DM,DS> & dst ,
-                const View<ST,SL,SD,SM,SS> & src ,
-                typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
-                                 typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value
-                  &&
-                  // Rank zero:
-                  ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) &&
-                  ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) )
-                )>::type * = 0 )
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
 {
-  typedef  View<DT,DL,DD,DM,DS>  dst_type ;
-  typedef  View<ST,SL,SD,SM,SS>  src_type ;
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
 
+  typedef typename dst_type::value_type    value_type ;
   typedef typename dst_type::memory_space  dst_memory_space ;
   typedef typename src_type::memory_space  src_memory_space ;
-  typedef typename src_type::value_type    value_type ;
 
-  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
-    Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) );
+  if ( dst.data() != src.data() ) {
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) );
   }
 }
 
@@ -1632,172 +1799,395 @@ void deep_copy( const ExecSpace& exec,
 /** \brief  A deep copy between views of the default specialization, compatible type,
  *          same non-zero rank, same contiguous layout.
  */
-template< class ExecSpace ,
-          class DT , class DL , class DD , class DM ,
-          class ST , class SL , class SD , class SM >
+template< class DT , class ... DP , class ST , class ... SP >
 inline
-void deep_copy( const ExecSpace & exec,
-                const View<DT,DL,DD,DM,Impl::ViewDefault> & dst ,
-                const View<ST,SL,SD,SM,Impl::ViewDefault> & src ,
-                typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type ,
-                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value
-                  &&
-                  // Same non-zero rank:
-                  ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ==
-                    unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) )
-                  &&
-                  ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) )
-                  &&
-                  // Same layout:
-                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout ,
-                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value
-                )>::type * = 0 )
+void deep_copy
+  ( const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
 {
-  typedef  View<DT,DL,DD,DM,Impl::ViewDefault>  dst_type ;
-  typedef  View<ST,SL,SD,SM,Impl::ViewDefault>  src_type ;
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(ViewTraits<DT,DP...>::rank) ==
+      unsigned(ViewTraits<ST,SP...>::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+
+    if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                       typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                           typename ViewTraits<ST,SP...>::array_layout >::value
+             &&
+             ( std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                             typename Kokkos::LayoutLeft>::value
+             ||
+               std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                             typename Kokkos::LayoutRight>::value
+             )
+           )
+           ||
+           ( ViewTraits<DT,DP...>::rank == 1 &&
+             ViewTraits<ST,SP...>::rank == 1 )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                            typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
+         (
+           ( std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                           typename ViewTraits<ST,SP...>::array_layout >::value
+             &&
+             std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                          typename Kokkos::LayoutStride>::value
+           )
+           ||
+           ( ViewTraits<DT,DP...>::rank == 1 &&
+             ViewTraits<ST,SP...>::rank == 1 )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() &&
+         dst.stride_0() == src.stride_0() &&
+         dst.stride_1() == src.stride_1() &&
+         dst.stride_2() == src.stride_2() &&
+         dst.stride_3() == src.stride_3() &&
+         dst.stride_4() == src.stride_4() &&
+         dst.stride_5() == src.stride_5() &&
+         dst.stride_6() == src.stride_6() &&
+         dst.stride_7() == src.stride_7()
+         ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
+}
 
-  typedef typename dst_type::memory_space  dst_memory_space ;
-  typedef typename src_type::memory_space  src_memory_space ;
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
-  enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout
-           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value ||
-           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value };
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
-  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+namespace Kokkos {
+namespace Experimental {
 
-    // Same shape (dimensions)
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class ExecSpace ,class DT , class ... DP >
+inline
+void deep_copy
+  ( const ExecSpace &
+  , const View<DT,DP...> & dst
+  , typename ViewTraits<DT,DP...>::const_value_type & value
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::non_const_value_type ,
+                  typename ViewTraits<DT,DP...>::value_type >::value
+    , "deep_copy requires non-const type" );
 
-    const bool shapes_are_equal = dst.shape() == src.shape();
+  Kokkos::Experimental::Impl::ViewFill< View<DT,DP...> >( dst , value );
+}
 
-    if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) {
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ExecSpace , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , typename ViewTraits<ST,SP...>::non_const_value_type & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value
+    >::type * = 0 )
+{
+  static_assert( ViewTraits<ST,SP...>::rank == 0
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
 
-      // Views span equal length contiguous range.
-      // Assuming can perform a straight memory copy over this range.
+  typedef ViewTraits<ST,SP...>               src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space , ExecSpace >
+    ( exec_space , & dst , src.data() , sizeof(ST) );
+}
 
-      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of compatible type, and rank zero.  */
+template< class ExecSpace , class DT , class ... DP , class ST , class ... SP >
+inline
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) == unsigned(0) &&
+      unsigned(ViewTraits<ST,SP...>::rank) == unsigned(0) )
+  )>::type * = 0 )
+{
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<ST,SP...>::non_const_value_type >::value
+    , "deep_copy requires matching non-const destination type" );
 
-      Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , nbytes );
-    }
-    else {
-      // Destination view's execution space must be able to directly access source memory space
-      // in order for the ViewRemap functor run in the destination memory space's execution space.
-      size_t stride[8];
-      src.stride(stride);
-      size_t size_stride = stride[0]*src.dimension_0();
-      size_t size_dim = src.dimension_0();
-      for(int i = 1; i<src.rank; i++) {
-        if(stride[i]*src.dimension(i)>size_stride)
-          size_stride = stride[i]*src.dimension(i);
-        size_dim*=src.dimension(i);
-      }
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
 
-      if( shapes_are_equal && size_stride == size_dim) {
-        const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+  typedef typename dst_type::value_type    value_type ;
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
 
-        Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >( exec , dst.ptr_on_device() , src.ptr_on_device() , nbytes );
-      } else {
-        Impl::ViewRemap< dst_type , src_type >( dst , src );
-      }
-    }
+  if ( dst.data() != src.data() ) {
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
+      ( exec_space , dst.data() , src.data() , sizeof(value_type) );
   }
 }
 
-
-/** \brief Deep copy equal dimension arrays in the same space which
- *         have different layouts or specializations.
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same non-zero rank, same contiguous layout.
  */
-template< class ExecSpace ,
-          class DT , class DL , class DD , class DM , class DS ,
-          class ST , class SL , class SD , class SM , class SS >
+template< class ExecSpace , class DT, class ... DP, class ST, class ... SP >
 inline
-void deep_copy( const ExecSpace& ,
-                const View< DT, DL, DD, DM, DS > & dst ,
-                const View< ST, SL, SD, SM, SS > & src ,
-                const typename Impl::enable_if<(
-                  // Same type and destination is not constant:
-                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
-                                 typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value
-                  &&
-                  // Source memory space is accessible to destination memory space
-                  Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space
-                                                           , typename View<ST,SL,SD,SM,SS>::memory_space >::value
-                  &&
-                  // Same non-zero rank
-                  ( unsigned( View<DT,DL,DD,DM,DS>::rank ) ==
-                    unsigned( View<ST,SL,SD,SM,SS>::rank ) )
-                  &&
-                  ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) )
-                  &&
-                  // Different layout or different specialization:
-                  ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout ,
-                                       typename View<ST,SL,SD,SM,SS>::array_layout >::value )
-                    ||
-                    ( ! Impl::is_same< DS , SS >::value )
-                  )
-                )>::type * = 0 )
+void deep_copy
+  ( const ExecSpace & exec_space
+  , const View<DT,DP...> & dst
+  , const View<ST,SP...> & src
+  , typename std::enable_if<(
+    Kokkos::Impl::is_execution_space< ExecSpace >::value &&
+    std::is_same< typename ViewTraits<DT,DP...>::specialize , void >::value &&
+    std::is_same< typename ViewTraits<ST,SP...>::specialize , void >::value &&
+    ( unsigned(ViewTraits<DT,DP...>::rank) != 0 ||
+      unsigned(ViewTraits<ST,SP...>::rank) != 0 )
+  )>::type * = 0 )
 {
-  typedef View< DT, DL, DD, DM, DS > dst_type ;
-  typedef View< ST, SL, SD, SM, SS > src_type ;
-
-  assert_shapes_equal_dimension( dst.shape() , src.shape() );
-
-  Impl::ViewRemap< dst_type , src_type >( dst , src );
+  static_assert(
+    std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                  typename ViewTraits<DT,DP...>::non_const_value_type >::value
+    , "deep_copy requires non-const destination type" );
+
+  static_assert(
+    ( unsigned(ViewTraits<DT,DP...>::rank) ==
+      unsigned(ViewTraits<ST,SP...>::rank) )
+    , "deep_copy requires Views of equal rank" );
+
+  typedef View<DT,DP...>  dst_type ;
+  typedef View<ST,SP...>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename src_type::execution_space  src_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  enum { SrcExecCanAccessDst =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename src_execution_space::memory_space , dst_memory_space >::value };
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+
+    if ( std::is_same< typename ViewTraits<DT,DP...>::value_type ,
+                       typename ViewTraits<ST,SP...>::non_const_value_type >::value &&
+         (
+           std::is_same< typename ViewTraits<DT,DP...>::array_layout ,
+                         typename ViewTraits<ST,SP...>::array_layout >::value
+           ||
+           ( ViewTraits<DT,DP...>::rank == 1 &&
+             ViewTraits<ST,SP...>::rank == 1 )
+         ) &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space , ExecSpace >
+        ( exec_space , dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else if ( SrcExecCanAccessDst ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type , src_execution_space >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
 }
 
-}
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+// Deduce Mirror Types
+template<class Space, class T, class ... P>
+struct MirrorViewType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not deep_copy into the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::View<data_type,array_layout,Space> dest_view_type;
+  // If it is the same memory_space return the existing view_type
+  // This will also keep the unmanaged trait if necessary
+  typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
+};
+
+template<class Space, class T, class ... P>
+struct MirrorType {
+  // The incoming view_type
+  typedef typename Kokkos::Experimental::View<T,P...> src_view_type;
+  // The memory space for the mirror view
+  typedef typename Space::memory_space memory_space;
+  // Check whether it is the same memory space
+  enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
+  // The array_layout
+  typedef typename src_view_type::array_layout array_layout;
+  // The data type (non-const, since otherwise we could not deep_copy into the mirror).
+  typedef typename src_view_type::non_const_data_type data_type;
+  // The destination view type if it is not the same memory space
+  typedef Kokkos::Experimental::View<data_type,array_layout,Space> view_type;
+};
 
-template< class T , class L , class D , class M , class S >
-typename Impl::enable_if<(
-    View<T,L,D,M,S>::is_managed &&
-    !Impl::is_same<L,LayoutStride>::value
-  ), typename View<T,L,D,M,S>::HostMirror >::type
+}
+
+template< class T , class ... P >
 inline
-create_mirror( const View<T,L,D,M,S> & src )
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,P...> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
 {
-  typedef View<T,L,D,M,S>                  view_type ;
-  typedef typename view_type::HostMirror    host_view_type ;
-
-  // 'view' is managed therefore we can allocate a
-  // compatible host_view through the ordinary constructor.
-
-  std::string label = src.tracker().label();
-  label.append("_mirror");
-
-  return host_view_type( label ,
-                         src.dimension_0() ,
-                         src.dimension_1() ,
-                         src.dimension_2() ,
-                         src.dimension_3() ,
-                         src.dimension_4() ,
-                         src.dimension_5() ,
-                         src.dimension_6() ,
-                         src.dimension_7() );
+  typedef View<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , src.dimension_0()
+                 , src.dimension_1()
+                 , src.dimension_2()
+                 , src.dimension_3()
+                 , src.dimension_4()
+                 , src.dimension_5()
+                 , src.dimension_6()
+                 , src.dimension_7() );
 }
 
-template< class T , class L , class D , class M , class S >
-typename Impl::enable_if<(
-    View<T,L,D,M,S>::is_managed &&
-    Impl::is_same<L,LayoutStride>::value
-  ), typename View<T,L,D,M,S>::HostMirror >::type
+template< class T , class ... P >
 inline
-create_mirror( const View<T,L,D,M,S> & src )
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,P...> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::Experimental::ViewTraits<T,P...>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
 {
-  typedef View<T,L,D,M,S>                  view_type ;
-  typedef typename view_type::HostMirror    host_view_type ;
+  typedef View<T,P...>                   src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
 
-  // 'view' is managed therefore we can allocate a
-  // compatible host_view through the ordinary constructor.
+  Kokkos::LayoutStride layout ;
 
-  std::string label = src.tracker().label();
-  label.append("_mirror");
-  LayoutStride layout;
-  src.stride(layout.stride);
   layout.dimension[0] = src.dimension_0();
   layout.dimension[1] = src.dimension_1();
   layout.dimension[2] = src.dimension_2();
@@ -1807,37 +2197,91 @@ create_mirror( const View<T,L,D,M,S> & src )
   layout.dimension[6] = src.dimension_6();
   layout.dimension[7] = src.dimension_7();
 
-  return host_view_type( label , layout );
+  layout.stride[0] = src.stride_0();
+  layout.stride[1] = src.stride_1();
+  layout.stride[2] = src.stride_2();
+  layout.stride[3] = src.stride_3();
+  layout.stride[4] = src.stride_4();
+  layout.stride[5] = src.stride_5();
+  layout.stride[6] = src.stride_6();
+  layout.stride[7] = src.stride_7();
+
+  return dst_type( std::string( src.label() ).append("_mirror") , layout );
+}
+
+
+// Create a mirror in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::Experimental::View<T,P...> & src) {
+  return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
 }
-template< class T , class L , class D , class M , class S >
-typename Impl::enable_if<(
-    View<T,L,D,M,S>::is_managed &&
-    Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value
-  ), typename View<T,L,D,M,S>::HostMirror >::type
+
+template< class T , class ... P >
 inline
-create_mirror_view( const View<T,L,D,M,S> & src )
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
 {
   return src ;
 }
 
-template< class T , class L , class D , class M , class S >
-typename Impl::enable_if<(
-    View<T,L,D,M,S>::is_managed &&
-    ! Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value
-  ), typename View<T,L,D,M,S>::HostMirror >::type
+template< class T , class ... P >
 inline
-create_mirror_view( const View<T,L,D,M,S> & src )
+typename Kokkos::Experimental::View<T,P...>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,P...> & src
+                  , typename std::enable_if< ! (
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::memory_space
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::memory_space
+                                  >::value
+                      &&
+                      std::is_same< typename Kokkos::Experimental::View<T,P...>::data_type
+                                  , typename Kokkos::Experimental::View<T,P...>::HostMirror::data_type
+                                  >::value
+                    )>::type * = 0
+                  )
 {
-  return create_mirror( src );
+  return Kokkos::Experimental::create_mirror( src );
+}
+
+// Create a mirror view in a new space (specialization for same space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
+  , typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return src;
+}
+
+// Create a mirror view in a new space (specialization for different space)
+template<class Space, class T, class ... P>
+typename Impl::MirrorViewType<Space,T,P ...>::view_type
+create_mirror_view(const Space& , const Kokkos::Experimental::View<T,P...> & src
+  , typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
+  return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
 }
 
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+namespace Kokkos {
+namespace Experimental {
+
 /** \brief  Resize a view with copying old data to new data at the corresponding indices. */
-template< class T , class L , class D , class M , class S >
+template< class T , class ... P >
 inline
-void resize( View<T,L,D,M,S> & v ,
-             const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 ,
+void resize( Kokkos::Experimental::View<T,P...> & v ,
+             const size_t n0 = 0 ,
              const size_t n1 = 0 ,
              const size_t n2 = 0 ,
              const size_t n3 = 0 ,
@@ -1846,24 +2290,22 @@ void resize( View<T,L,D,M,S> & v ,
              const size_t n6 = 0 ,
              const size_t n7 = 0 )
 {
-  typedef View<T,L,D,M,S> view_type ;
+  typedef Kokkos::Experimental::View<T,P...>  view_type ;
 
-  const std::string label = v.tracker().label();
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
 
-  view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+  view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
 
-  Impl::ViewRemap< view_type , view_type >( v_resized , v );
-
-  view_type::execution_space::fence();
+  Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v );
 
   v = v_resized ;
 }
 
-/** \brief  Reallocate a view without copying old data to new data */
-template< class T , class L , class D , class M , class S >
+/** \brief  Reallocate a view without copying old data to new data. */
+template< class T , class ... P >
 inline
-void realloc( View<T,L,D,M,S> & v ,
-              const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 ,
+void realloc( Kokkos::Experimental::View<T,P...> & v ,
+              const size_t n0 = 0 ,
               const size_t n1 = 0 ,
               const size_t n2 = 0 ,
               const size_t n3 = 0 ,
@@ -1872,239 +2314,71 @@ void realloc( View<T,L,D,M,S> & v ,
               const size_t n6 = 0 ,
               const size_t n7 = 0 )
 {
-  typedef View<T,L,D,M,S> view_type ;
+  typedef Kokkos::Experimental::View<T,P...>  view_type ;
+
+  static_assert( Kokkos::Experimental::ViewTraits<T,P...>::is_managed , "Can only realloc managed views" );
 
-  // Query the current label and reuse it.
-  const std::string label = v.tracker().label();
+  const std::string label = v.label();
 
-  v = view_type(); // deallocate first, if the only view to memory.
+  v = view_type(); // Deallocate first, if the only view to allocation
   v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
 }
 
-} // namespace Kokkos
+} /* namespace Experimental */
+} /* namespace Kokkos */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
 
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
-          class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                          , ArgType4 , ArgType5 , ArgType6 , ArgType7
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 ,
-         const ArgType3 & arg3 ,
-         const ArgType4 & arg4 ,
-         const ArgType5 & arg5 ,
-         const ArgType6 & arg6 ,
-         const ArgType7 & arg7 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                 , ArgType4 , ArgType5 , ArgType6 , ArgType7
-                 >::type
-      DstViewType ;
+template< class D , class ... P >
+using ViewTraits = Kokkos::Experimental::ViewTraits<D,P...> ;
 
-  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 );
-}
+using Experimental::View ; //modified due to gcc parser bug 
+//template< class D , class ... P >
+//using View = Kokkos::Experimental::View<D,P...> ;
 
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
-          class ArgType4 , class ArgType5 , class ArgType6 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                          , ArgType4 , ArgType5 , ArgType6 , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 ,
-         const ArgType3 & arg3 ,
-         const ArgType4 & arg4 ,
-         const ArgType5 & arg5 ,
-         const ArgType6 & arg6 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                 , ArgType4 , ArgType5 , ArgType6 , void
-                 >::type
-      DstViewType ;
-
-  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 );
-}
+using Kokkos::Experimental::ALL ;
+using Kokkos::Experimental::WithoutInitializing ;
+using Kokkos::Experimental::AllowPadding ;
+using Kokkos::Experimental::view_alloc ;
+using Kokkos::Experimental::view_wrap ;
 
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
-          class ArgType4 , class ArgType5 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                          , ArgType4 , ArgType5 , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 ,
-         const ArgType3 & arg3 ,
-         const ArgType4 & arg4 ,
-         const ArgType5 & arg5 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                 , ArgType4 , ArgType5 , void , void
-                 >::type
-      DstViewType ;
-
-  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5 );
-}
-
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
-          class ArgType4 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                          , ArgType4 , void , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 ,
-         const ArgType3 & arg3 ,
-         const ArgType4 & arg4 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                 , ArgType4 , void , void , void
-                 >::type
-      DstViewType ;
-
-  return DstViewType( src, arg0, arg1, arg2, arg3, arg4 );
-}
-
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                          , void , void , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 ,
-         const ArgType3 & arg3 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
-                 , void , void , void , void
-                 >::type
-      DstViewType ;
-
-  return DstViewType( src, arg0, arg1, arg2, arg3 );
-}
-
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 , class ArgType2 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , ArgType2 , void
-                          , void , void , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 ,
-         const ArgType2 & arg2 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , ArgType2 , void
-                 , void , void , void , void
-                 >::type
-      DstViewType ;
+using Kokkos::Experimental::deep_copy ;
+using Kokkos::Experimental::create_mirror ;
+using Kokkos::Experimental::create_mirror_view ;
+using Kokkos::Experimental::subview ;
+using Kokkos::Experimental::resize ;
+using Kokkos::Experimental::realloc ;
+using Kokkos::Experimental::is_view ;
 
-  return DstViewType( src, arg0, arg1, arg2 );
-}
+namespace Impl {
 
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 , class ArgType1 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , ArgType1 , void , void
-                          , void , void , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 ,
-         const ArgType1 & arg1 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , ArgType1 , void , void
-                 , void , void , void , void
-                 >::type
-      DstViewType ;
+using Kokkos::Experimental::is_view ;
 
-  return DstViewType( src, arg0, arg1 );
-}
+class ViewDefault {};
 
-template< class D , class A1 , class A2 , class A3 , class S ,
-          class ArgType0 >
-KOKKOS_INLINE_FUNCTION
-typename Impl::ViewSubview< View<D,A1,A2,A3,S>
-                          , ArgType0 , void , void , void
-                          , void , void , void , void
-                          >::type
-subview( const View<D,A1,A2,A3,S> & src ,
-         const ArgType0 & arg0 )
-{
-  typedef typename
-    Impl::ViewSubview< View<D,A1,A2,A3,S>
-                 , ArgType0 , void , void , void
-                 , void , void , void , void
-                 >::type
-      DstViewType ;
+template< class SrcViewType
+        , class Arg0Type
+        , class Arg1Type
+        , class Arg2Type
+        , class Arg3Type
+        , class Arg4Type
+        , class Arg5Type
+        , class Arg6Type
+        , class Arg7Type
+        >
+struct ViewSubview /* { typedef ... type ; } */ ;
 
-  return DstViewType( src, arg0 );
 }
 
-} // namespace Kokkos
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+} /* namespace Kokkos */
 
-#include <impl/Kokkos_ViewDefault.hpp>
 #include <impl/Kokkos_Atomic_View.hpp>
 
-#include <impl/Kokkos_ViewOffset.hpp>
-#include <impl/Kokkos_ViewSupport.hpp>
-
-namespace Kokkos {
-/** \brief  Tag denoting that a subview should capture all of a dimension */
-struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} };
-}
-
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-#include <KokkosExp_View.hpp>
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
+#endif /* #ifndef KOKKOS_VIEW_HPP */
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
index c58706bbaa..27ae5803ce 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -178,9 +178,10 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType, class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -192,15 +193,21 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
 
   template< class TagType >
@@ -252,7 +259,7 @@ public:
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 
 #pragma omp parallel
       {
@@ -260,7 +267,7 @@ public:
         const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
         ParallelReduce::template exec_range< WorkTag >
           ( m_functor , range.begin() , range.end()
-          , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
       }
 /* END #pragma omp parallel */
 
@@ -269,13 +276,13 @@ public:
       const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
 
       for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
       }
 
-      Kokkos::Impl::FunctorFinal<  FunctorType , WorkTag >::final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( m_functor );
+        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
 
         for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
@@ -289,7 +296,7 @@ public:
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 
 #pragma omp parallel
       {
@@ -302,7 +309,7 @@ public:
 
         long work_index = exec.get_work_index();
 
-        reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
+        reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
         while(work_index != -1) {
           const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
           const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
@@ -319,13 +326,13 @@ public:
       const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
 
       for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
       }
 
-      Kokkos::Impl::FunctorFinal<  FunctorType , WorkTag >::final( m_functor , ptr );
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( m_functor );
+        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
 
         for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
@@ -337,18 +344,35 @@ public:
   inline
   ParallelReduce( const FunctorType & arg_functor
                 , Policy       arg_policy
-                , const ViewType    & arg_result_view )
+                , const ViewType    & arg_result_view
+                , typename std::enable_if<
+                           Kokkos::is_view< ViewType >::value &&
+                           !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_result_ptr(  arg_result_view.ptr_on_device() )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
     {
-      static_assert( Kokkos::is_view< ViewType >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
 
-      static_assert( std::is_same< typename ViewType::memory_space
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
                                       , Kokkos::HostSpace >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
     }
+
 };
 
 } // namespace Impl
@@ -568,13 +592,13 @@ public:
 
       const size_t team_reduce_size = Policy::member_type::team_reduce_size();
 
-      OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
+      OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
 
 #pragma omp parallel
       {
         ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
           ( m_functor
-          , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
+          , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
       }
 /* END #pragma omp parallel */
     }
@@ -584,14 +608,15 @@ public:
                const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
 };
 
 
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -602,15 +627,19 @@ private:
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag >  ValueJoin ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType  m_reducer ;
   const pointer_type m_result_ptr ;
   const int          m_shmem_size ;
 
@@ -644,7 +673,7 @@ public:
 
       const size_t team_reduce_size = Policy::member_type::team_reduce_size();
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
+      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
 
 #pragma omp parallel
       {
@@ -652,8 +681,8 @@ public:
 
         ParallelReduce::template exec_team< WorkTag >
           ( m_functor
-          , Member( exec , m_policy , m_shmem_size )
-          , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+          , Member( exec , m_policy , m_shmem_size, 0 )
+          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
       }
 /* END #pragma omp parallel */
 
@@ -665,13 +694,13 @@ public:
           max_active_threads = m_policy.league_size()* m_policy.team_size();
 
         for ( int i = 1 ; i < max_active_threads ; ++i ) {
-          ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+          ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
         }
 
-        Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
 
         if ( m_result_ptr ) {
-          const int n = ValueTraits::value_count( m_functor );
+          const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
 
           for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
         }
@@ -682,12 +711,33 @@ public:
   inline
   ParallelReduce( const FunctorType  & arg_functor ,
                   const Policy       & arg_policy ,
-                  const ViewType     & arg_result )
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
+
 };
 
 } // namespace Impl
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
new file mode 100644
index 0000000000..3e22033f7c
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -0,0 +1,329 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::OpenMP > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::OpenMP >::
+TaskExec()
+  : m_self_exec( 0 )
+  , m_team_exec( 0 )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( 0 )
+  , m_team_rank( 0 )
+  , m_team_size( 1 )
+{
+}
+
+TaskExec< Kokkos::OpenMP >::
+TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
+  : m_self_exec( & arg_exec )
+  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
+  , m_sync_mask( 0 )
+  , m_sync_value( 0 )
+  , m_sync_step( 0 )
+  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
+  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
+  , m_team_size(  arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
+
+  sync[0] = int64_t(0) ;
+  sync[1] = int64_t(0) ;
+
+  for ( int i = 0 ; i < m_team_size ; ++i ) {
+    m_sync_value |= int64_t(1) << (8*i);
+    m_sync_mask  |= int64_t(3) << (8*i);
+  }
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
+{
+  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+    Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
+  }
+
+  // Use team shared memory to synchronize.
+  // Alternate memory locations between barriers to avoid a sequence
+  // of barriers overtaking one another.
+
+  int64_t volatile * const sync =
+    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+  // This team member sets one byte within the sync variable
+  int8_t volatile * const sync_self =
+   ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+  while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout
+       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
+       , m_group_rank
+       , m_team_rank
+       , m_sync_step
+       , m_sync_value
+       , *sync
+       );
+fflush(stdout);
+#endif
+
+  ++m_sync_step ;
+
+  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
+    m_sync_value ^= m_sync_mask ;
+    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::execute
+  ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using PoolExec        = Kokkos::Impl::OpenMPexec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;
+    Member team_exec( self , team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire OpenMP thread pool to insure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      task_root_type * task = 0 ;
+
+      // Each team lead attempts to acquire either a thread team task
+      // or a single thread task for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+            task = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+      }
+
+      // Team lead broadcast acquired task to team members:
+
+      if ( 1 < team_exec.team_size() ) {
+
+        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
+
+        // Fence to be sure task_shared is stored before the barrier
+        Kokkos::memory_fence();
+
+        // Whole team waits for every team member to reach this statement
+        team_exec.team_barrier();
+
+        // Fence to be sure task_shared is stored
+        Kokkos::memory_fence();
+
+        task = *task_shared ;
+      }
+
+#if 0
+fprintf( stdout
+       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
+       , team_exec.m_group_rank
+       , team_exec.m_team_rank
+       , uintptr_t(task_shared)
+       , uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {
+        // All team members wait for whole team to reach this statement.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task , & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task ); 
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task , & single_exec );
+
+          queue->complete( task ); 
+        }
+
+        // All team members wait for whole team to reach this statement.
+        // Not necessary to complete the task.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::OpenMP >::
+  iff_single_thread_recursive_execute
+    ( TaskQueue< Kokkos::OpenMP > * const queue )
+{
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  if ( 1 == omp_get_num_threads() ) {
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    Member single_exec ;
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;
+
+      (*task->m_apply)( task , & single_exec );
+
+      queue->complete( task ); 
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
new file mode 100644
index 0000000000..2761247c40
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -0,0 +1,356 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
+#define KOKKOS_IMPL_OPENMP_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::OpenMP >
+{
+public:
+
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  // Must specify memory space
+  using memory_space = Kokkos::HostSpace ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::OpenMP
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::OpenMP > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::OpenMP >
+{
+private:
+
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::OpenMPexec ;
+
+  friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;
+  int64_t mutable  m_sync_value ;
+  int     mutable  m_sync_step ;
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec , int arg_team_size );
+
+  void team_barrier_impl() const ;
+
+public:
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any teeam member returns from
+   *         this function call.
+   */
+  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
+TeamThreadRange
+  ( Impl::TaskExec< Kokkos::OpenMP > & thread
+  , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >
+TeamThreadRange
+  ( Impl:: TaskExec< Kokkos::OpenMP > & thread
+  , const iType & start
+  , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >(thread,start,end);
+}
+
+/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the calling thread team.
+ * This functionality requires C++11 support.
+*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
+  , const Lambda& lambda
+  )
+{
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i);
+  }
+}
+
+template<typename iType, class Lambda, typename ValueType>
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
+  , const Lambda& lambda
+  , ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        shared[0] += shared[i];
+      }
+    }
+
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i, result);
+  }
+
+  if ( 1 < loop_boundaries.thread.team_size() ) {
+    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+
+    loop_boundaries.thread.team_barrier();
+    shared[team_rank] = result;
+
+    loop_boundaries.thread.team_barrier();
+
+    // reduce across threads to thread 0
+    if (team_rank == 0) {
+      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
+        join(shared[0], shared[i]);
+      }
+    }
+
+    loop_boundaries.thread.team_barrier();
+
+    // broadcast result
+    initialized_result = shared[0];
+  }
+  else {
+    initialized_result = result ;
+  }
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;
+  ValueType val, local_total;
+  ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
+  int team_size = loop_boundaries.thread.team_size();
+  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
+
+  // Intra-member scan
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+
+  shared[team_rank] = accum;
+  loop_boundaries.thread.team_barrier();
+
+  // Member 0 do scan on accumulated totals
+  if (team_rank == 0) {
+    for( iType i = 1; i < team_size; i+=1) {
+      shared[i] += shared[i-1];
+    }
+    accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
+  }
+
+  loop_boundaries.thread.team_barrier();
+
+  // Inter-member scan adding in accumulated totals
+  if (team_rank != 0) { accum = shared[team_rank-1]; }
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
index f73f1e932a..7d06a2f661 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@@ -49,6 +49,7 @@
 #include <impl/Kokkos_Error.hpp>
 #include <iostream>
 #include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 
 #ifdef KOKKOS_HAVE_OPENMP
 
@@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
 int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-OpenMPexec::Pool OpenMPexec::m_pool;
-
-#else
-
 OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
-#endif
-
 void OpenMPexec::verify_is_process( const char * const label )
 {
   if ( omp_in_parallel() ) {
@@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
 #pragma omp parallel
   {
     const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-#if KOKKOS_USING_EXP_VIEW
     typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
     if ( m_pool[ rank_rev ] ) {
       Record * const r = Record::get_record( m_pool[ rank_rev ] );
       m_pool[ rank_rev ] = 0 ;
       Record::decrement( r );
     }
-#else
-    m_pool.at(rank_rev).clear();
-#endif
   }
 /* END #pragma omp parallel */
 }
@@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
       const int rank_rev = m_map_rank[ omp_get_thread_num() ];
       const int rank     = pool_size - ( rank_rev + 1 );
 
-#if KOKKOS_USING_EXP_VIEW
-
       typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
 
       Record * const r = Record::allocate( Kokkos::HostSpace()
@@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
 
       m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
 
-#else
-
-      #pragma omp critical
-      {
-        m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
-      }
-
-#endif
-
       new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
     }
 /* END #pragma omp parallel */
@@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
   }
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
 }
 
 //----------------------------------------------------------------------------
@@ -350,6 +332,10 @@ void OpenMP::finalize()
   if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
     hwloc::unbind_this_thread();
   }
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
index 723b2f9429..a01c9cb644 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@@ -46,7 +46,6 @@
 
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_spinwait.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 
 #include <Kokkos_Atomic.hpp>
 #include <iostream>
@@ -63,38 +62,10 @@ public:
 
   enum { MAX_THREAD_COUNT = 4096 };
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  struct Pool
-  {
-    Pool() : m_trackers() {}
-
-    AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
-
-    OpenMPexec * operator[](int i)
-    {
-      return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
-    }
-
-    AllocationTracker & at(int i)
-    {
-      return m_trackers[i];
-    }
-  };
-
-
-private:
-
-  static Pool         m_pool; // Indexed by: m_pool_rank_rev
-
-#else
-
 private:
 
   static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
 
-#endif
-
   static int          m_pool_topo[ 4 ];
   static int          m_map_rank[ MAX_THREAD_COUNT ];
 
@@ -145,6 +116,12 @@ public:
 
   inline long team_work_index() const { return m_team_work_index ; }
 
+  inline int scratch_reduce_size() const
+    { return m_scratch_reduce_end - m_scratch_exec_end ; }
+
+  inline int scratch_thread_size() const
+    { return m_scratch_thread_end - m_scratch_reduce_end ; }
+
   inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
   inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
 
@@ -157,15 +134,15 @@ public:
 
   ~OpenMPexec() {}
 
-  OpenMPexec( const int poolRank
-            , const int scratch_exec_size
-            , const int scratch_reduce_size
-            , const int scratch_thread_size )
-    : m_pool_rank( poolRank )
-    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
-    , m_scratch_exec_end( scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end   + scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
+  OpenMPexec( const int arg_poolRank
+            , const int arg_scratch_exec_size
+            , const int arg_scratch_reduce_size
+            , const int arg_scratch_thread_size )
+    : m_pool_rank( arg_poolRank )
+    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
+    , m_scratch_exec_end( arg_scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end   + arg_scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
     , m_barrier_state(0)
     {}
 
@@ -330,7 +307,7 @@ public:
 
   Impl::OpenMPexec    & m_exec ;
   scratch_memory_space  m_team_shared ;
-  int                   m_team_shmem ;
+  int                   m_team_scratch_size[2] ;
   int                   m_team_base_rev ;
   int                   m_team_rank_rev ;
   int                   m_team_rank ;
@@ -378,15 +355,15 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
 
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
   KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@@ -568,11 +545,12 @@ public:
   inline
   OpenMPexecTeamMember( Impl::OpenMPexec & exec
                       , const TeamPolicyInternal< OpenMP, Properties ...> & team
-                      , const int shmem_size
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
                       )
     : m_exec( exec )
     , m_team_shared(0,0)
-    , m_team_shmem( shmem_size )
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
     , m_team_base_rev(0)
     , m_team_rank_rev(0)
     , m_team_rank(0)
@@ -580,7 +558,7 @@ public:
     , m_league_rank(0)
     , m_league_end(0)
     , m_league_size( team.league_size() )
-    , m_chunk_size( team.chunk_size() )
+    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
     , m_league_chunk_end(0)
     , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
     , m_team_alloc( team.team_alloc())
@@ -589,10 +567,9 @@ public:
       const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
       const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
       const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
-      const int chunk_size           = team.chunk_size()>0?team.chunk_size():team.team_iter();
-      const int chunks_per_team      = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
-            int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
-            int league_iter_begin    = league_iter_end - chunks_per_team * chunk_size;
+      const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
+            int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
+            int league_iter_begin    = league_iter_end - chunks_per_team * m_chunk_size;
       if (league_iter_begin < 0)     league_iter_begin = 0;
       if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
 
@@ -611,7 +588,9 @@ public:
         m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
         m_league_end     = league_iter_end ;
         m_league_rank    = league_iter_begin ;
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                               0 );
       }
 
       if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
@@ -627,10 +606,13 @@ public:
 
   void next_static()
     {
-      if ( ++m_league_rank < m_league_end ) {
+      if ( m_league_rank < m_league_end ) {
         team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                               0);
       }
+      m_league_rank++;
     }
 
   bool valid_dynamic() {
@@ -661,10 +643,13 @@ public:
     if(m_invalid_thread)
       return;
 
-    team_barrier();
-    if ( ++m_league_rank < m_league_chunk_end ) {
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+    if ( m_league_rank < m_league_chunk_end ) {
+      team_barrier();
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                             0);
     }
+    m_league_rank++;
   }
 
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
@@ -687,8 +672,10 @@ public:
     m_team_size = p.m_team_size;
     m_team_alloc = p.m_team_alloc;
     m_team_iter = p.m_team_iter;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -719,8 +706,8 @@ private:
   int m_team_alloc ;
   int m_team_iter ;
 
-  size_t m_team_scratch_size;
-  size_t m_thread_scratch_size;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
 
   int m_chunk_size;
 
@@ -753,15 +740,19 @@ public:
 
   inline int team_size()   const { return m_team_size ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
 
   /** \brief  Specify league size, request team size */
   TeamPolicyInternal( typename traits::execution_space &
             , int league_size_request
             , int team_size_request
             , int /* vector_length_request */ = 1 )
-            : m_team_scratch_size ( 0 )
-            , m_thread_scratch_size ( 0 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
             , m_chunk_size(0)
     { init( league_size_request , team_size_request ); }
 
@@ -769,24 +760,24 @@ public:
             , int league_size_request
             , const Kokkos::AUTO_t & /* team_size_request */
             , int /* vector_length_request */ = 1)
-            : m_team_scratch_size ( 0 )
-            , m_thread_scratch_size ( 0 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
             , m_chunk_size(0)
     { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
 
   TeamPolicyInternal( int league_size_request
             , int team_size_request
             , int /* vector_length_request */ = 1 )
-            : m_team_scratch_size ( 0 )
-            , m_thread_scratch_size ( 0 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
             , m_chunk_size(0)
     { init( league_size_request , team_size_request ); }
 
   TeamPolicyInternal( int league_size_request
             , const Kokkos::AUTO_t & /* team_size_request */
             , int /* vector_length_request */ = 1 )
-            : m_team_scratch_size ( 0 )
-            , m_thread_scratch_size ( 0 )
+            : m_team_scratch_size { 0 , 0 }
+            , m_thread_scratch_size { 0 , 0 }
             , m_chunk_size(0)
     { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
 
@@ -803,24 +794,21 @@ public:
   }
 
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
 
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
index 92c5b97b9a..3123a297c4 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
@@ -104,7 +104,7 @@ namespace Kokkos {
 
 int Qthread::is_initialized()
 {
-  Impl::s_number_workers != 0 ;
+  return Impl::s_number_workers != 0 ;
 }
 
 int Qthread::concurrency()
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
index a1f533b232..f948eb2903 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -113,7 +113,7 @@ public:
         m_worker_state = QthreadExec::Inactive ;
         Impl::spinwait( m_worker_state , QthreadExec::Inactive );
       }
-    
+
       for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
         m_worker_base[j]->m_worker_state = QthreadExec::Active ;
       }
@@ -136,7 +136,7 @@ public:
           m_worker_state = QthreadExec::Inactive ;
           Impl::spinwait( m_worker_state , QthreadExec::Inactive );
         }
-    
+
         for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
           m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
         }
@@ -145,11 +145,13 @@ public:
 
   //----------------------------------------
   /** Reduce across all workers participating in the 'exec_all' */
-  template< class FunctorType , class ArgTag >
+  template< class FunctorType , class ReducerType , class ArgTag >
   inline
-  void exec_all_reduce( const FunctorType & func ) const
+  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
     {
-      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
+      typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+      typedef typename ReducerConditional::type ReducerTypeFwd;
+      typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
 
       const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
 
@@ -160,14 +162,14 @@ public:
 
         Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
 
-        ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
+        ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
       }
 
       if ( rev_rank ) {
         m_worker_state = QthreadExec::Inactive ;
         Impl::spinwait( m_worker_state , QthreadExec::Inactive );
       }
-    
+
       for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
         m_worker_base[j]->m_worker_state = QthreadExec::Active ;
       }
@@ -197,7 +199,7 @@ public:
       }
       else {
         // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_worker_base[0] is the 
+        // Worker data is in reverse order, so m_worker_base[0] is the
         // highest ranking thread.
 
         // Copy from lower ranking to higher ranking worker.
@@ -216,7 +218,7 @@ public:
           ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
         }
       }
-    
+
       for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
         m_worker_base[j]->m_worker_state = QthreadExec::Active ;
       }
@@ -349,7 +351,7 @@ public:
       }
       else {
         // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_shepherd_base[0] is the 
+        // Worker data is in reverse order, so m_shepherd_base[0] is the
         // highest ranking thread.
 
         // Copy from lower ranking to higher ranking worker.
@@ -371,7 +373,7 @@ public:
 
         memory_fence();
       }
-    
+
       for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
         m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
       }
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
index 2e3cdce562..5b6419289f 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -130,9 +130,10 @@ public:
 
 //----------------------------------------------------------------------------
 
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType , class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
                     , Kokkos::Qthread
                     >
 {
@@ -141,18 +142,24 @@ private:
   typedef Kokkos::RangePolicy< Traits ... >  Policy ;
 
   typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
   typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  // Static Assert WorkTag void if ReducerType not InvalidType
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
-  const FunctorType  m_functor ;
-  const Policy       m_policy ;
-  const pointer_type m_result_ptr ;
+  const FunctorType   m_functor ;
+  const Policy        m_policy ;
+  const ReducerType   m_reducer ;
+  const pointer_type  m_result_ptr ;
 
   template< class TagType >
   inline static
@@ -187,9 +194,10 @@ private:
 
     ParallelReduce::template exec_range< WorkTag >(
       self.m_functor, range.begin(), range.end(),
-      ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
+      ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
+                     , exec.exec_all_reduce_value() ) );
 
-    exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
   }
 
 public:
@@ -197,26 +205,39 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
       Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
 
       const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
 
-      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
 
       if ( m_result_ptr ) {
-        const unsigned n = ValueTraits::value_count( m_functor );
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
         for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
       }
     }
 
-  template< class HostViewType >
+  template< class ViewType >
   ParallelReduce( const FunctorType  & arg_functor
                 , const Policy       & arg_policy
-                , const HostViewType & arg_result_view )
+                , const ViewType & arg_result_view
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
     : m_functor( arg_functor )
-    , m_policy(  arg_policy )
-    , m_result_ptr( arg_result_view.ptr_on_device() )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
+    , m_result_ptr( arg_result_view.data() )
+    { }
+
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy( arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr( reducer.result_view().data() )
     { }
 };
 
@@ -291,10 +312,12 @@ public:
 
 //----------------------------------------------------------------------------
 
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType , class ... Properties >
 class ParallelReduce< FunctorType
                     , TeamPolicy< Properties... >
-                    , Kokkos::Qthread >
+                    , ReducerType
+                    , Kokkos::Qthread
+                    >
 {
 private:
 
@@ -303,14 +326,18 @@ private:
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType  m_reducer ;
   const pointer_type m_result_ptr ;
 
   template< class TagType >
@@ -345,9 +372,10 @@ private:
     ParallelReduce::template exec_team< WorkTag >
       ( self.m_functor
       , Member( exec , self.m_policy )
-      , ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
+      , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
+                       , exec.exec_all_reduce_value() ) );
 
-    exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
+    exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
   }
 
 public:
@@ -356,29 +384,43 @@ public:
   void execute() const
     {
       QthreadExec::resize_worker_scratch
-        ( /* reduction   memory */ ValueTraits::value_size( m_functor )
+        ( /* reduction   memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
         , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
 
       Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
 
       const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
 
-      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
 
       if ( m_result_ptr ) {
-        const unsigned n = ValueTraits::value_count( m_functor );
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
         for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
       }
     }
 
   template< class ViewType >
-  ParallelReduce( const FunctorType & arg_functor ,
-                  const Policy      & arg_policy ,
-                  const ViewType    & arg_result )
+  ParallelReduce( const FunctorType & arg_functor
+                , const Policy      & arg_policy
+                , const ViewType    & arg_result
+                , typename std::enable_if<Kokkos::is_view< ViewType >::value &&
+                                          !Kokkos::is_reducer_type< ReducerType >::value
+                                          , void*>::type = NULL)
     : m_functor( arg_functor )
-    , m_policy(  arg_policy )
+    , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
     { }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy( arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr( reducer.result_view().data() )
+  { }
 };
 
 //----------------------------------------------------------------------------
@@ -395,8 +437,8 @@ private:
   typedef Kokkos::RangePolicy< Traits ... >  Policy ;
 
   typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
   typedef typename Policy::WorkRange    WorkRange ;
+  typedef typename Policy::member_type  Member ;
 
   typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
index 0765072030..8cc39d277c 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
@@ -58,6 +58,8 @@
 #include <Kokkos_Atomic.hpp>
 #include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
 
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -120,13 +122,13 @@ Task::~TaskMember()
 }
 
 
-Task::TaskMember( const function_verify_type        arg_verify
-                , const function_dealloc_type       arg_dealloc
-                , const function_apply_single_type  arg_apply_single
-                , const function_apply_team_type    arg_apply_team
-                , volatile int &                    arg_active_count
-                , const unsigned                    arg_sizeof_derived
-                , const unsigned                    arg_dependence_capacity
+Task::TaskMember( const function_verify_type   arg_verify
+                , const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity
                 )
   : m_dealloc( arg_dealloc )
   , m_verify(  arg_verify )
@@ -144,12 +146,12 @@ Task::TaskMember( const function_verify_type        arg_verify
   for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
 }
 
-Task::TaskMember( const function_dealloc_type       arg_dealloc
-                , const function_apply_single_type  arg_apply_single
-                , const function_apply_team_type    arg_apply_team
-                , volatile int &                    arg_active_count
-                , const unsigned                    arg_sizeof_derived
-                , const unsigned                    arg_dependence_capacity
+Task::TaskMember( const function_dealloc_type  arg_dealloc
+                , const function_single_type   arg_apply_single
+                , const function_team_type     arg_apply_team
+                , volatile int &               arg_active_count
+                , const unsigned               arg_sizeof_derived
+                , const unsigned               arg_dependence_capacity
                 )
   : m_dealloc( arg_dealloc )
   , m_verify(  & Task::verify_type<void> )
@@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
                                         , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
                                         );
 
-  // It is a single thread's responsibility to close out
-  // this task's execution.
-  bool close_out = false ;
-
   if ( task->m_apply_team && ! task->m_apply_single ) {
-    const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
+    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
 
     // Initialize team size and rank with shephered info
     Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
@@ -344,7 +342,7 @@ fflush(stdout);
     if ( member.team_rank() == 0 ) task->closeout();
     member.team_barrier();
   }
-  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
+  else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
     // Team hard-wired to one, no cloning
     Kokkos::Impl::QthreadTeamPolicyMember member ;
     (*task->m_apply_team)( task , member );
@@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
 } // namespace Experimental
 } // namespace Kokkos
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
 
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
index 118f13d9f1..22a565503d 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
@@ -69,6 +69,8 @@
 
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -80,24 +82,24 @@ class TaskMember< Kokkos::Qthread , void , void >
 {
 public:
 
-  typedef void         (* function_apply_single_type) ( TaskMember * );
-  typedef void         (* function_apply_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
-  typedef void         (* function_dealloc_type)( TaskMember * );
   typedef TaskMember * (* function_verify_type) ( TaskMember * );
+  typedef void         (* function_single_type) ( TaskMember * );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
+  typedef void         (* function_dealloc_type)( TaskMember * );
 
 private:
 
-  const function_dealloc_type       m_dealloc ;       ///< Deallocation
-  const function_verify_type        m_verify ;        ///< Result type verification
-  const function_apply_single_type  m_apply_single ;  ///< Apply function
-  const function_apply_team_type    m_apply_team ;    ///< Apply function
-  int volatile * const              m_active_count ;  ///< Count of active tasks on this policy
-  aligned_t                         m_qfeb ;          ///< Qthread full/empty bit
-  TaskMember ** const               m_dep ;           ///< Dependences
-  const int                         m_dep_capacity ;  ///< Capacity of dependences
-  int                               m_dep_size ;      ///< Actual count of dependences
-  int                               m_ref_count ;     ///< Reference count
-  int                               m_state ;         ///< State of the task
+  const function_dealloc_type  m_dealloc ;       ///< Deallocation
+  const function_verify_type   m_verify ;        ///< Result type verification
+  const function_single_type   m_apply_single ;  ///< Apply function
+  const function_team_type     m_apply_team ;    ///< Apply function
+  int volatile * const         m_active_count ;  ///< Count of active tasks on this policy
+  aligned_t                    m_qfeb ;          ///< Qthread full/empty bit
+  TaskMember ** const          m_dep ;           ///< Dependences
+  const int                    m_dep_capacity ;  ///< Capacity of dependences
+  int                          m_dep_size ;      ///< Actual count of dependences
+  int                          m_ref_count ;     ///< Reference count
+  int                          m_state ;         ///< State of the task
 
   TaskMember() /* = delete */ ;
   TaskMember( const TaskMember & ) /* = delete */ ;
@@ -128,22 +130,22 @@ protected :
   ~TaskMember();
 
   // Used by TaskMember< Qthread , ResultType , void >
-  TaskMember( const function_verify_type        arg_verify
-            , const function_dealloc_type       arg_dealloc
-            , const function_apply_single_type  arg_apply_single
-            , const function_apply_team_type    arg_apply_team
-            , volatile int &                    arg_active_count
-            , const unsigned                    arg_sizeof_derived
-            , const unsigned                    arg_dependence_capacity
+  TaskMember( const function_verify_type   arg_verify
+            , const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
             );
 
   // Used for TaskMember< Qthread , void , void >
-  TaskMember( const function_dealloc_type       arg_dealloc
-            , const function_apply_single_type  arg_apply_single
-            , const function_apply_team_type    arg_apply_team
-            , volatile int &                    arg_active_count
-            , const unsigned                    arg_sizeof_derived
-            , const unsigned                    arg_dependence_capacity
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
             );
 
 public:
@@ -221,7 +223,7 @@ public:
       typedef typename DerivedTaskType::functor_type  functor_type ;
       typedef typename functor_type::value_type       value_type ;
 
-      const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 );
+      const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
 
       DerivedTaskType * const task =
         new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
@@ -379,16 +381,16 @@ protected:
 
   typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
   typedef task_root_type::function_dealloc_type        function_dealloc_type ;
-  typedef task_root_type::function_apply_single_type   function_apply_single_type ;
-  typedef task_root_type::function_apply_team_type     function_apply_team_type ;
+  typedef task_root_type::function_single_type         function_single_type ;
+  typedef task_root_type::function_team_type           function_team_type ;
 
   inline
-  TaskMember( const function_dealloc_type       arg_dealloc
-            , const function_apply_single_type  arg_apply_single
-            , const function_apply_team_type    arg_apply_team
-            , volatile int &                    arg_active_count
-            , const unsigned                    arg_sizeof_derived
-            , const unsigned                    arg_dependence_capacity
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
             )
     : task_root_type( & task_root_type::template verify_type< ResultType >
                     , arg_dealloc
@@ -413,17 +415,17 @@ public:
   typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
   typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
   typedef task_root_type::function_dealloc_type              function_dealloc_type ;
-  typedef task_root_type::function_apply_single_type         function_apply_single_type ;
-  typedef task_root_type::function_apply_team_type           function_apply_team_type ;
+  typedef task_root_type::function_single_type               function_single_type ;
+  typedef task_root_type::function_team_type                 function_team_type ;
 
   inline
-  TaskMember( const function_dealloc_type       arg_dealloc
-            , const function_apply_single_type  arg_apply_single
-            , const function_apply_team_type    arg_apply_team
-            , volatile int &                    arg_active_count
-            , const unsigned                    arg_sizeof_derived
-            , const unsigned                    arg_dependence_capacity
-            , const functor_type &              arg_functor
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_single_type   arg_apply_single
+            , const function_team_type     arg_apply_team
+            , volatile int &               arg_active_count
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            , const functor_type &         arg_functor
             )
     : task_base_type( arg_dealloc
                     , arg_apply_single
@@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
 public:
 
   typedef Kokkos::Qthread                        execution_space ;
+  typedef TaskPolicy                             execution_policy ;
   typedef Kokkos::Impl::QthreadTeamPolicyMember  member_type ;
 
 private:
@@ -489,14 +492,17 @@ public:
     , const unsigned arg_task_team_size = 0 /* choose default */
     );
 
-  TaskPolicy() = default ;
-  TaskPolicy( TaskPolicy && rhs ) = default ;
-  TaskPolicy( const TaskPolicy & rhs ) = default ;
-  TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
-  TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy() = default ;
+  KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
+  KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
 
   //----------------------------------------
 
+  KOKKOS_INLINE_FUNCTION
+  int allocated_task_count() const { return m_active_count ; }
+
   template< class ValueType >
   const Future< ValueType , execution_space > &
     spawn( const Future< ValueType , execution_space > & f 
@@ -653,5 +659,6 @@ public:
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #define KOKKOS_QTHREAD_TASK_HPP */
 
diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthread/README
index 5d8f29a4ee..6e6c86a9ef 100644
--- a/lib/kokkos/core/src/Qthread/README
+++ b/lib/kokkos/core/src/Qthread/README
@@ -3,26 +3,23 @@
 
 # Cloning repository and branch:
 
-git clone https://github.com/stelleg/qthreads qthreads-with-clone
+git clone git@github.com:Qthreads/qthreads.git qthreads
 
-cd qthreads-with-clone
+cd qthreads
 
-# Added to ./git/config
-#
-# [branch "cloned_tasks"]
-#        remote = origin
-#        merge = refs/heads/cloned_tasks
-#
+# checkout branch with "cloned tasks"
 
-git branch cloned_tasks
-git checkout cloned_tasks
-git pull
+git checkout dev-kokkos
+
+# Configure/autogen
 
 sh autogen.sh
 
-# configurure with 'hwloc' installation:
+# configure with 'hwloc' installation:
 
 ./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
 
+# install
 
+make install
 
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index e1d3fe06e6..5f0b8f70cd 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -53,6 +53,7 @@
 #include <Kokkos_Core.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 
 
 //----------------------------------------------------------------------------
@@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
 
 ThreadsExec::ThreadsExec()
   : m_pool_base(0)
-#if ! KOKKOS_USING_EXP_VIEW
-  , m_scratch()
-#else
   , m_scratch(0)
-#endif
   , m_scratch_reduce_end(0)
   , m_scratch_thread_end(0)
   , m_numa_rank(0)
@@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
 {
   const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
 
-#if KOKKOS_USING_EXP_VIEW
-
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
 
   if ( m_scratch ) {
@@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
     Record::decrement( r );
   }
 
-#else
-
-  m_scratch.clear();
-
-#endif
-
   m_pool_base   = 0 ;
   m_scratch_reduce_end = 0 ;
   m_scratch_thread_end = 0 ;
@@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
 
 void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
 {
-#if KOKKOS_USING_EXP_VIEW
-
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
 
   if ( exec.m_scratch ) {
@@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
     Record::decrement( r );
   }
 
-#else
-
-  exec.m_scratch.clear();
-
-#endif
-
   exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
   exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
 
   if ( s_threads_process.m_scratch_thread_end ) {
 
-#if KOKKOS_USING_EXP_VIEW
-
     // Allocate tracked memory:
     {
       Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
@@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
 
     unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
 
-#else
-
-    exec.m_scratch =
-      HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
-
-    unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
-
-#endif
-
     unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
 
     // touch on this thread
@@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
     s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
   }
 
-#if KOKKOS_USING_EXP_VIEW
   return s_threads_process.m_scratch ;
-#else
-  return s_threads_process.m_scratch.alloc_ptr() ;
-#endif
 }
 
 //----------------------------------------------------------------------------
@@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
 
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
 }
 
 //----------------------------------------------------------------------------
@@ -807,6 +776,10 @@ void ThreadsExec::finalize()
   s_threads_process.m_pool_size       = 1 ;
   s_threads_process.m_pool_fan_size   = 0 ;
   s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index b2019aaf77..4ec1450d0f 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -49,7 +49,6 @@
 #include <utility>
 #include <impl/Kokkos_spinwait.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 
 #include <Kokkos_Atomic.hpp>
 
@@ -89,11 +88,7 @@ private:
 
   ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
 
-#if ! KOKKOS_USING_EXP_VIEW
-  Impl::AllocationTracker m_scratch ;
-#else
   void *        m_scratch ;
-#endif
   int           m_scratch_reduce_end ;
   int           m_scratch_thread_end ;
   int           m_numa_rank ;
@@ -138,19 +133,10 @@ public:
   static int get_thread_count();
   static ThreadsExec * get_thread( const int init_thread_rank );
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-  inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
-  KOKKOS_INLINE_FUNCTION  void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
-
-#else
-
   inline void * reduce_memory() const { return m_scratch ; }
   KOKKOS_INLINE_FUNCTION  void * scratch_memory() const
     { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
 
-#endif
-
   KOKKOS_INLINE_FUNCTION  int volatile & state() { return m_pool_state ; }
   KOKKOS_INLINE_FUNCTION  ThreadsExec * const * pool_base() const { return m_pool_base ; }
 
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index b425ac4773..3407ffaa54 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -129,15 +129,15 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space & team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space & team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
 
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space & thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
 
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
   KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@@ -433,10 +433,11 @@ public:
 
   void next_static()
     {
-      if ( ++m_league_rank < m_league_end ) {
+      if ( m_league_rank < m_league_end ) {
         team_barrier();
         set_team_shared();
       }
+      m_league_rank++;
     }
 
   bool valid_dynamic() {
@@ -468,10 +469,11 @@ public:
     if(m_invalid_thread)
       return;
 
-    team_barrier();
-    if ( ++m_league_rank < m_league_chunk_end ) {
+    if ( m_league_rank < m_league_chunk_end ) {
+      team_barrier();
       set_team_shared();
     }
+    m_league_rank++;
   }
 
   void set_league_shmem( const int arg_league_rank
@@ -504,8 +506,8 @@ private:
   int m_team_alloc ;
   int m_team_iter ;
 
-  size_t m_team_scratch_size;
-  size_t m_thread_scratch_size;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
 
   int m_chunk_size;
 
@@ -549,8 +551,10 @@ public:
     m_team_size = p.m_team_size;
     m_team_alloc = p.m_team_alloc;
     m_team_iter = p.m_team_iter;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -577,7 +581,12 @@ public:
   inline int team_size() const { return m_team_size ; }
   inline int team_alloc() const { return m_team_alloc ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
+
   inline int team_iter() const { return m_team_iter ; }
 
   /** \brief  Specify league size, request team size */
@@ -588,8 +597,8 @@ public:
     : m_league_size(0)
     , m_team_size(0)
     , m_team_alloc(0)
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init(league_size_request,team_size_request); (void) vector_length_request; }
 
@@ -601,8 +610,8 @@ public:
     : m_league_size(0)
     , m_team_size(0)
     , m_team_alloc(0)
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
 
@@ -612,8 +621,8 @@ public:
     : m_league_size(0)
     , m_team_size(0)
     , m_team_alloc(0)
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init(league_size_request,team_size_request); }
 
@@ -623,8 +632,8 @@ public:
     : m_league_size(0)
     , m_team_size(0)
     , m_team_alloc(0)
-    , m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    , m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
 
@@ -639,26 +648,23 @@ public:
 
   /** \brief set per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
 
   /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
   /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
 
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
index 55ddecf87f..1aba00c94b 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@@ -264,7 +264,7 @@ public:
              , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     { }
 };
 
@@ -272,9 +272,10 @@ public:
 //----------------------------------------------------------------------------
 /* ParallelReduce with Kokkos::Threads and RangePolicy */
 
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType, class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
+                    , ReducerType
                     , Kokkos::Threads
                     >
 {
@@ -286,14 +287,18 @@ private:
   typedef typename Policy::WorkRange   WorkRange ;
   typedef typename Policy::member_type Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type m_result_ptr ;
 
   template< class TagType >
@@ -344,9 +349,9 @@ private:
 
     ParallelReduce::template exec_range< WorkTag >
       ( self.m_functor , range.begin() , range.end() 
-      , ValueInit::init( self.m_functor , exec.reduce_memory() ) );
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
 
-    exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
   }
 
   template<class Schedule>
@@ -362,7 +367,7 @@ private:
     exec.barrier();
 
     long work_index = exec.get_work_index();
-    reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() );
+    reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
     while(work_index != -1) {
       const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
       const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
@@ -372,7 +377,7 @@ private:
       work_index = exec.get_work_index();
     }
 
-    exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
   }
 
 public:
@@ -380,7 +385,7 @@ public:
   inline
   void execute() const
     {
-      ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 
       ThreadsExec::start( & ParallelReduce::exec , this );
 
@@ -391,7 +396,7 @@ public:
         const pointer_type data =
           (pointer_type) ThreadsExec::root_reduce_scratch();
 
-        const unsigned n = ValueTraits::value_count( m_functor );
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
         for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
       }
     }
@@ -399,9 +404,14 @@ public:
   template< class HostViewType >
   ParallelReduce( const FunctorType  & arg_functor ,
                   const Policy       & arg_policy ,
-                  const HostViewType & arg_result_view )
+                  const HostViewType & arg_result_view ,
+                  typename std::enable_if<
+                               Kokkos::is_view< HostViewType >::value &&
+                              !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
     , m_result_ptr( arg_result_view.ptr_on_device() )
     {
       static_assert( Kokkos::is_view< HostViewType >::value
@@ -410,14 +420,30 @@ public:
       static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
         , "Kokkos::Threads reduce result must be a View in HostSpace" );
     }
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
+
 };
 
 //----------------------------------------------------------------------------
 /* ParallelReduce with Kokkos::Threads and TeamPolicy */
 
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::Threads
                     >
 {
@@ -426,14 +452,19 @@ private:
   typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... >              Policy ;
   typedef typename Policy::work_tag                                WorkTag ;
   typedef typename Policy::member_type                             Member ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
 
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType  m_reducer ;
   const pointer_type m_result_ptr ;
   const int          m_shared ;
 
@@ -464,9 +495,9 @@ private:
 
     ParallelReduce::template exec_team< WorkTag >
       ( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
-      , ValueInit::init( self.m_functor , exec.reduce_memory() ) );
+      , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
 
-    exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
+    exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
   }
 
 public:
@@ -474,7 +505,7 @@ public:
   inline
   void execute() const
     {
-      ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
+      ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
 
       ThreadsExec::start( & ParallelReduce::exec , this );
 
@@ -484,20 +515,41 @@ public:
 
         const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
 
-        const unsigned n = ValueTraits::value_count( m_functor );
+        const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
         for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
       }
     }
 
   template< class ViewType >
-  ParallelReduce( const FunctorType & arg_functor
-                , const Policy      & arg_policy
-                , const ViewType    & arg_result )
+  inline
+  ParallelReduce( const FunctorType  & arg_functor ,
+                  const Policy       & arg_policy ,
+                  const ViewType     & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                    ,void*>::type = NULL)
     : m_functor( arg_functor )
-    , m_policy( arg_policy )
+    , m_policy(  arg_policy )
+    , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
-    { }
+    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {}
+
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+    , Policy       arg_policy
+    , const ReducerType& reducer )
+  : m_functor( arg_functor )
+  , m_policy(  arg_policy )
+  , m_reducer( reducer )
+  , m_result_ptr(  reducer.result_view().data() )
+  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  {
+  /*static_assert( std::is_same< typename ViewType::memory_space
+                          , Kokkos::HostSpace >::value
+  , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+  }
 };
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
index 258e683a4f..e1599284b2 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
@@ -46,9 +46,10 @@
 #include <stdio.h>
 #include <iostream>
 #include <sstream>
+#include <Kokkos_Core.hpp>
 #include <Threads/Kokkos_Threads_TaskPolicy.hpp>
 
-#if defined( KOKKOS_HAVE_PTHREAD )
+#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
 
 #define QLOCK   (reinterpret_cast<void*>( ~((uintptr_t)0) ))
 #define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
@@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
   , const unsigned arg_task_team_size
   )
   : m_space( Kokkos::Threads::memory_space()
-           , arg_task_max_size
-           , arg_task_max_size * arg_task_max_count
-           , 1 /* only one level of memory pool */
+           , arg_task_max_size * arg_task_max_count * 1.2
+           , 16 /* log2(superblock size) */
            )
   , m_team { 0 , 0 , 0 }
   , m_serial { 0 , 0 , 0 }
@@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
   // User created task memory pool with an estimate,
   // if estimate is to low then report and throw exception.
 
-  if ( m_space.get_min_chunk_size() < size_alloc ) {
+  if ( m_space.get_min_block_size() < size_alloc ) {
     fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
            , int(size_alloc)
-           , int(m_space.get_min_chunk_size())
+           , int(m_space.get_min_block_size())
            );
     fflush(stderr);
     Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
@@ -926,5 +926,5 @@ void Task::clear_dependence()
 } /* namespace Experimental */
 } /* namespace Kokkos */
 
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
 
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
index a0c28afd0c..116d32e4fc 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
@@ -50,7 +50,7 @@
 #include <Kokkos_Threads.hpp>
 #include <Kokkos_TaskPolicy.hpp>
 
-#if defined( KOKKOS_HAVE_PTHREAD )
+#if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
 
 //----------------------------------------------------------------------------
 
@@ -737,10 +737,9 @@ public:
 } /* namespace Experimental */
 } /* namespace Kokkos */
 
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
-
 //----------------------------------------------------------------------------
 
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
 
 
diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
index feb3632d43..1498eafb00 100644
--- a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
+++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
@@ -246,8 +246,8 @@ private:
   enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
 
   // The allocation record resides in Host memory space
-  Record  * m_record ;
   uintptr_t m_record_bits ;
+  Record  * m_record ;
 
 public:
 
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp
index 9ace88dfb1..6525fed0a5 100644
--- a/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewCtor.hpp
@@ -47,8 +47,6 @@
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-#if KOKKOS_USING_EXP_VIEW
-
 namespace Kokkos {
 
 /* For backward compatibility */
@@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
 
 } /* namespace Kokkos */
 
-#endif
-
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
index 39339185e7..ed56536cd9 100644
--- a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
@@ -2604,18 +2604,24 @@ class ViewMapping< DstTraits , SrcTraits ,
     &&
     std::is_same< typename DstTraits::specialize , void >::value
     &&
-    (
-      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
-      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
-      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
-    )
-    &&
     std::is_same< typename SrcTraits::specialize , void >::value
     &&
     (
-      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
-      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
-      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+      std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+      ||
+      (
+        (
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+        &&
+        (
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+          std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+        )
+      )
     )
   )>::type >
 {
diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp
deleted file mode 100644
index c95557793a..0000000000
--- a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp
+++ /dev/null
@@ -1,848 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-
-#include <Kokkos_Atomic.hpp>
-
-#include <impl/Kokkos_Singleton.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-
-#include <string>
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include <utility>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <iomanip>
-
-/* Enable clean up of memory leaks */
-#define CLEAN_UP_MEMORY_LEAKS 0
-
-namespace Kokkos { namespace Impl {
-
-namespace {
-
-
-//-----------------------------------------------------------------------------
-// AllocationRecord
-//-----------------------------------------------------------------------------
-//
-// Used to track details about an allocation and provide a ref count
-// sizeof(AllocationRecord) == 128
-struct AllocationRecord
-{
-  enum {
-     OFFSET = sizeof(AllocatorBase*)          // allocator
-            + sizeof(void*)                   // alloc_ptr
-            + sizeof(uint64_t)                // alloc_size
-            + sizeof(AllocatorAttributeBase*) // attribute
-            + sizeof(uint32_t)                // node_index
-            + sizeof(uint32_t)                // ref_count
-   , LABEL_LENGTH = 128 - OFFSET
-  };
-
-  AllocatorBase * const          allocator;
-  void * const                   alloc_ptr;
-  const uint64_t                 alloc_size;
-  AllocatorAttributeBase * const attribute;
-  const int32_t                  node_index;
-  volatile uint32_t              ref_count;
-  const char                     label[LABEL_LENGTH];
-
-
-  AllocationRecord(  AllocatorBase * const arg_allocator
-                   , void *   arg_alloc_ptr
-                   , uint64_t arg_alloc_size
-                   , int32_t  arg_node_index
-                   , const std::string & arg_label
-                  )
-    : allocator(arg_allocator)
-    , alloc_ptr(arg_alloc_ptr)
-    , alloc_size(arg_alloc_size)
-    , attribute(NULL)
-    , node_index(arg_node_index)
-    , ref_count(1)
-    , label() // zero fill
-  {
-    const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
-    strncpy( const_cast<char *>(label), arg_label.c_str(), length );
-  }
-
-  ~AllocationRecord()
-  {
-    if (attribute) {
-      delete attribute;
-    }
-  }
-
-  uint32_t increment_ref_count()
-  {
-    uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
-    return old_value + 1u;
-  }
-
-  uint32_t decrement_ref_count()
-  {
-    uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
-    return old_value - 1u;
-  }
-
-  void print( std::ostream & oss ) const
-  {
-    oss << "{ " << allocator->name()
-        << " } : \"" << label
-        << "\" ref_count(" << ref_count
-        << ") memory[ " << alloc_ptr
-        << " + " << alloc_size
-        << " ]" ;
-  }
-
-  bool set_attribute( AllocatorAttributeBase * attr )
-  {
-    bool result = false;
-    if (attribute == NULL) {
-      result = NULL == atomic_compare_exchange(  const_cast<AllocatorAttributeBase **>(&attribute)
-                                               , reinterpret_cast<AllocatorAttributeBase *>(NULL)
-                                               , attr );
-    }
-
-    return result;
-  }
-
-  // disallow copy and assignment
-  AllocationRecord( const AllocationRecord & );
-  AllocationRecord & operator=(const AllocationRecord &);
-};
-
-template <int NumBlocks>
-struct Bitset
-{
-  enum { blocks = NumBlocks };
-  enum { size = blocks * 64 };
-  enum { block_mask = 63u };
-  enum { block_shift = 6 };
-
-  // used to find free bits in a bitset
-  static int count_trailing_zeros(uint64_t x)
-  {
-    #if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
-      return x ? __builtin_ctzll(x) : 64;
-    #elif defined( KOKKOS_COMPILER_INTEL )
-      enum { shift = 32 };
-      enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
-      return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
-             (x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
-             64 ;
-    #elif defined( KOKKOS_COMPILER_IBM )
-      return x ? __cnttz8(x) : 64;
-    #else
-      int i = 0;
-      for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
-      return i;
-    #endif
-  }
-
-  Bitset()
-    : m_bits()
-  {
-    for (int i=0; i < blocks; ++i) {
-      m_bits[i] = 0u;
-    }
-  }
-
-  bool set( int i )
-  {
-    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
-    return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
-  }
-
-  bool reset( int i )
-  {
-    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
-    return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
-  }
-
-  bool test( int i )
-  {
-    const uint64_t block = m_bits[ i >> block_shift ];
-    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
-    return block & bit;
-  }
-
-  int find_first_unset() const
-  {
-    for (int i=0; i < blocks; ++i) {
-      const uint64_t block = m_bits[i];
-      int b = count_trailing_zeros( ~block );
-
-      if ( b < 64 ) {
-        return (i << block_shift) + b;
-      }
-    }
-    return size;
-  }
-
-  volatile uint64_t m_bits[blocks];
-};
-
-//-----------------------------------------------------------------------------
-// AllocationRecordPool -- singleton class
-//
-// global_alloc_rec_pool is the ONLY instance of this class
-//
-//-----------------------------------------------------------------------------
-// Record AllocationRecords in a lock-free circular list.
-// Each node in the list has a buffer with space for 959 ((15*64)-1) records
-// managed by a bitset.  Atomics are used to set and reset bits in the bit set.
-// The head of the list is atomically updated to the last node found with
-// unused space.
-//
-// Cost time to create an allocation record: amortized O(1), worst case O(num nodes)
-// Cost to destroy an allocation recored: O(1)
-//
-// Singleton allocations are pushed onto a lock-free stack that is destroyed
-// after the circular list of allocation records.
-struct AllocationRecordPool
-{
-  enum { BITSET_BLOCKS = 15 };
-
-  typedef Bitset<BITSET_BLOCKS> bitset_type;
-
-  enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
-
-  struct AllocationNode
-  {
-    AllocationNode()
-      : next()
-      , bitset()
-      , buffer()
-    {
-      // set the first bit to used
-      bitset.set(0);
-    }
-
-    void * get_buffer( int32_t node_index )
-    {
-      return buffer + (node_index-1) * sizeof(AllocationRecord);
-    }
-
-    // return 0 if no space is available in the node
-    int32_t get_node_index()
-    {
-      int32_t node_index = 0;
-      do {
-        node_index = bitset.find_first_unset();
-
-        // successfully claimed a bit
-        if ( node_index != bitset.size && bitset.set(node_index) )
-        {
-          return node_index;
-        }
-      } while ( node_index != bitset.size );
-      return 0;
-    }
-
-    void clear_node_index( int32_t node_index )
-    {
-      bitset.reset(node_index);
-    }
-
-    AllocationNode * next;
-    bitset_type      bitset;
-    char             buffer[BUFFER_SIZE];
-  };
-
-  struct SingletonNode
-  {
-    void * buffer;
-    SingletonNode * next;
-    Impl::singleton_destroy_function_type destroy;
-
-    SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func  )
-      : buffer(NULL)
-      , next(NULL)
-      , destroy(destroy_func)
-    {
-      if (size) {
-        buffer = malloc(size);
-        create_func(buffer);
-      }
-    }
-
-    ~SingletonNode()
-    {
-      if (buffer) {
-        try {
-          destroy(buffer);
-        } catch(...) {}
-        free(buffer);
-      }
-    }
-  };
-
-  AllocationRecordPool()
-    : head( new AllocationNode() )
-    , singleton_head(NULL)
-  {
-    // setup ring
-    head->next = head;
-  }
-
-  ~AllocationRecordPool()
-  {
-    // delete allocation records
-    {
-      AllocationNode * start = head;
-
-      AllocationNode * curr = start;
-
-      std::vector< std::string > string_vec;
-
-      do {
-        AllocationNode * next = curr->next;
-
-        #if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
-        // print node bitset
-        for (int i=0; i < bitset_type::blocks; ++i ) {
-          std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << "   ";
-        }
-        std::cout << std::endl;
-        #endif
-
-        // bit zero does not map to an AllocationRecord
-        for ( int32_t i=1; i < bitset_type::size; ++i )
-        {
-          if (curr->bitset.test(i)) {
-            AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
-
-            std::ostringstream oss;
-            alloc_rec->print( oss );
-            string_vec.push_back( oss.str() );
-
-#if CLEAN_UP_MEMORY_LEAKS
-/* Cleaning up memory leaks prevents memory error detection tools
- * from reporting the original source of allocation, which can
- * impede debugging with such tools.
- */
-            try {
-              destroy(alloc_rec);
-            }
-            catch(...) {}
-#endif
-          }
-        }
-
-        curr->next = NULL;
-
-        delete curr;
-
-        curr = next;
-      } while ( curr != start );
-
-      //if ( !string_vec.empty() ) {
-      //  std::sort( string_vec.begin(), string_vec.end() );
-      //
-      //  std::ostringstream oss;
-      //  oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
-      //  for (size_t i=0; i< string_vec.size(); ++i)
-      //  {
-      //    oss << "   " << string_vec[i] << std::endl;
-      //  }
-      //
-      //  std::cerr << oss.str() << std::endl;
-      //}
-    }
-
-    // delete singletons
-    {
-      SingletonNode * curr = singleton_head;
-
-      while (curr) {
-        SingletonNode * next = curr->next;
-        delete curr;
-        curr = next;
-      }
-    }
-  }
-
-  AllocationRecord * create(  AllocatorBase * arg_allocator
-                            , void * arg_alloc_ptr
-                            , size_t arg_alloc_size
-                            , const std::string & arg_label
-                           )
-  {
-    AllocationNode * start = volatile_load(&head);
-
-    AllocationNode * curr = start;
-
-
-    int32_t node_index = curr->get_node_index();
-
-    if (node_index == 0) {
-      curr = volatile_load(&curr->next);
-    }
-
-    while (node_index == 0 && curr != start)
-    {
-      node_index = curr->get_node_index();
-      if (node_index == 0) {
-        curr = volatile_load(&curr->next);
-      }
-    }
-
-    // Need to allocate and insert a new node
-    if (node_index == 0 && curr == start)
-    {
-      AllocationNode * new_node = new AllocationNode();
-
-      node_index = new_node->get_node_index();
-
-      AllocationNode * next = NULL;
-      do {
-        next = volatile_load(&curr->next);
-        new_node->next = next;
-        memory_fence();
-      } while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
-
-      curr = new_node;
-    }
-
-    void * buffer = curr->get_buffer(node_index);
-
-    // try to set head to curr
-    if ( start != curr )
-    {
-      atomic_compare_exchange( & head, start, curr );
-    }
-
-    return new (buffer) AllocationRecord(  arg_allocator
-                                         , arg_alloc_ptr
-                                         , arg_alloc_size
-                                         , node_index
-                                         , arg_label
-                                        );
-  }
-
-  void destroy( AllocationRecord * alloc_rec )
-  {
-    if (alloc_rec) {
-      const int32_t node_index = alloc_rec->node_index;
-      AllocationNode * node = get_node( alloc_rec );
-
-      // deallocate memory
-      alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
-
-      // call destructor
-      alloc_rec->~AllocationRecord();
-
-      // wait for writes to complete
-      memory_fence();
-
-      // clear node index
-      node->clear_node_index( node_index );
-    }
-  }
-
-  void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
-  {
-    SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
-    SingletonNode * next;
-
-    // insert new node at the head of the list
-    do {
-      next = volatile_load(&singleton_head);
-      node->next = next;
-    } while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
-
-    return node->buffer;
-  }
-
-  void print_memory( std::ostream & out ) const
-  {
-    AllocationNode * start = head;
-
-    AllocationNode * curr = start;
-
-    std::vector< std::string > string_vec;
-
-    do {
-      AllocationNode * next = curr->next;
-
-      // bit zero does not map to an AllocationRecord
-      for ( int32_t i=1; i < bitset_type::size; ++i )
-      {
-        if (curr->bitset.test(i)) {
-          AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
-
-          std::ostringstream oss;
-          alloc_rec->print( oss );
-          string_vec.push_back( oss.str() );
-        }
-      }
-      curr = next;
-    } while ( curr != start );
-
-    if ( !string_vec.empty() ) {
-      std::sort( string_vec.begin(), string_vec.end() );
-
-      std::ostringstream oss;
-      oss << "Tracked Memory:" << std::endl;
-      for (size_t i=0; i< string_vec.size(); ++i)
-      {
-        oss << "   " << string_vec[i] << std::endl;
-      }
-      out << oss.str() << std::endl;
-    }
-    else {
-      out << "No Tracked Memory" << std::endl;
-    }
-  }
-
-  // find an AllocationRecord such that
-  // alloc_ptr <= ptr < alloc_ptr + alloc_size
-  // otherwise return NULL
-  AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
-  {
-    AllocationNode * start = head;
-
-    AllocationNode * curr = start;
-
-    char const * const char_ptr = reinterpret_cast<const char *>(ptr);
-
-    do {
-      AllocationNode * next = curr->next;
-
-      // bit zero does not map to an AllocationRecord
-      for ( int32_t i=1; i < bitset_type::size; ++i )
-      {
-        if (curr->bitset.test(i)) {
-          AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
-
-          char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
-
-          if (   (allocator == alloc_rec->allocator)
-              && (alloc_ptr <= char_ptr)
-              && (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
-          {
-            return alloc_rec;
-          }
-        }
-      }
-      curr = next;
-    } while ( curr != start );
-
-    return NULL;
-  }
-
-private:
-
-  AllocationNode * get_node( AllocationRecord * alloc_rec )
-  {
-    return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
-  }
-
-  AllocationNode * head;
-  SingletonNode * singleton_head;
-};
-
-// create the global pool for allocation records
-AllocationRecordPool global_alloc_rec_pool;
-
-
-
-// convert a uintptr_t to an AllocationRecord pointer
-inline
-AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
-{
-  return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
-}
-
-} // unnamed namespace
-
-//-----------------------------------------------------------------------------
-// Allocation Tracker methods
-//-----------------------------------------------------------------------------
-
-// Create a reference counted AllocationTracker
-void AllocationTracker::initalize(  AllocatorBase * arg_allocator
-                                  , void * arg_alloc_ptr
-                                  , size_t arg_alloc_size
-                                  , const std::string & arg_label
-                                 )
-{
-  if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
-    // create record
-    AllocationRecord * alloc_rec = global_alloc_rec_pool.create(  arg_allocator
-                                                                , arg_alloc_ptr
-                                                                , arg_alloc_size
-                                                                , arg_label
-                                                               );
-
-    m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
-  }
-}
-
-void AllocationTracker::reallocate( size_t size ) const
-{
-  AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
-
-  void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
-
-  if ( NULL != the_alloc_ptr )
-  {
-    *const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
-    *const_cast<uint64_t *>(&rec->alloc_size) = size;
-  }
-  else {
-    Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
-  }
-}
-
-
-void AllocationTracker::increment_ref_count() const
-{
-  to_alloc_rec( m_alloc_rec )->increment_ref_count();
-}
-
-
-void AllocationTracker::decrement_ref_count() const
-{
-  AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
-  uint32_t the_ref_count = alloc_rec->decrement_ref_count();
-  if (the_ref_count == 0u) {
-    try {
-      global_alloc_rec_pool.destroy( alloc_rec );
-    }
-    catch(...) {}
-  }
-}
-
-namespace {
-
-struct NullAllocator { static const char * name() { return "Null Allocator"; } };
-
-}
-
-AllocatorBase * AllocationTracker::allocator() const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->allocator;
-  }
-  return Allocator<NullAllocator>::singleton();
-}
-
-void * AllocationTracker::alloc_ptr()  const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->alloc_ptr;
-  }
-  return NULL;
-}
-
-size_t AllocationTracker::alloc_size() const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->alloc_size;
-  }
-  return 0u;
-}
-
-size_t AllocationTracker::ref_count()  const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->ref_count;
-  }
-  return 0u;
-}
-
-char const * AllocationTracker::label() const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->label;
-  }
-  return "[Empty Allocation Tracker]";
-}
-
-void AllocationTracker::print( std::ostream & oss) const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    to_alloc_rec(m_alloc_rec)->print(oss);
-  }
-  else {
-    oss << label();
-  }
-}
-
-bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
-{
-  bool result = false;
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
-  }
-  return result;
-}
-
-AllocatorAttributeBase * AllocationTracker::attribute() const
-{
-  if (m_alloc_rec & REF_COUNT_MASK) {
-    return to_alloc_rec(m_alloc_rec)->attribute;
-  }
-  return NULL;
-}
-
-void AllocationTracker::print_tracked_memory( std::ostream & out )
-{
-  global_alloc_rec_pool.print_memory( out );
-}
-
-
-AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
-{
-  AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
-
-  AllocationTracker tracker;
-
-  if ( alloc_rec != NULL )
-  {
-    if ( tracking_enabled() ) {
-      alloc_rec->increment_ref_count();
-      tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
-    }
-    else {
-      tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
-    }
-  }
-
-  return tracker ;
-}
-
-
-
-//-----------------------------------------------------------------------------
-// static AllocationTracker
-//-----------------------------------------------------------------------------
-#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
-namespace {
-
-  // TODO : Detect compiler support for thread local variables
-  #if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
-    bool g_thread_local_tracking_enabled = true;
-    #pragma omp threadprivate(g_thread_local_tracking_enabled)
-  #elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
-    __thread bool g_thread_local_tracking_enabled = true;
-  #elif defined( KOKKOS_HAVE_OPENMP )
-    bool g_thread_local_tracking_enabled = true;
-    #pragma omp threadprivate(g_thread_local_tracking_enabled)
-  #elif defined( KOKKOS_HAVE_PTHREAD )
-    __thread bool g_thread_local_tracking_enabled = true;
-  #elif defined( KOKKOS_HAVE_SERIAL )
-      bool g_thread_local_tracking_enabled = true;
-  #endif
-} // unnamed namespace
-
-void AllocationTracker::disable_tracking()
-{
-  g_thread_local_tracking_enabled = false;
-}
-
-void AllocationTracker::enable_tracking()
-{
-  g_thread_local_tracking_enabled = true;
-}
-
-bool AllocationTracker::tracking_enabled()
-{
-  return g_thread_local_tracking_enabled;
-}
-#else
-namespace {
-enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
-volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
-}
-
-void AllocationTracker::disable_tracking()
-{
-  if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
-    Impl::throw_runtime_exception("Error: Tracking already disabled");
-  }
-}
-
-void AllocationTracker::enable_tracking()
-{
-  if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
-    Impl::throw_runtime_exception("Error: Tracking already enabled");
-  }
-}
-
-bool AllocationTracker::tracking_enabled()
-{
-  return g_tracking_enabled == TRACKING_ENABLED;
-}
-#endif
-
-
-//-----------------------------------------------------------------------------
-// create singleton free function
-//-----------------------------------------------------------------------------
-void * create_singleton(  size_t size
-                        , Impl::singleton_create_function_type create_func
-                        , Impl::singleton_destroy_function_type destroy_func )
-{
-  return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
-}
-
-}} // namespace Kokkos::Impl
-
-#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp
deleted file mode 100644
index 738a9d7908..0000000000
--- a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp
+++ /dev/null
@@ -1,574 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
-#define KOKKOS_ALLOCATION_TRACKER_HPP
-
-#include <Kokkos_Macros.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-#include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-#include <stdint.h>
-#include <cstdlib>
-#include <string>
-#include <iosfwd>
-
-namespace Kokkos { namespace Impl {
-
-//-----------------------------------------------------------------------------
-// Create Singleton objects
-//-----------------------------------------------------------------------------
-
-typedef void * (*singleton_create_function_type)(void * buffer);
-typedef void (*singleton_destroy_function_type)(void *);
-
-void * create_singleton(  size_t size
-                        , singleton_create_function_type create_func
-                        , singleton_destroy_function_type destroy_func
-                       );
-
-
-
-/// class Singleton
-///
-/// Default construct a singleton type.  This method is used to circumvent
-/// order of construction issues.  Singleton objects are destroyed after all
-/// other allocations in the reverse order of their creation.
-template <typename Type>
-class Singleton
-{
-public:
-  /// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
-  static Type * get()
-  {
-    static Type * singleton = NULL;
-    if (singleton == NULL) {
-      Impl::singleton_create_function_type  create_func = &create;
-      Impl::singleton_destroy_function_type destroy_func = &destroy;
-      singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
-    }
-    return singleton;
-  }
-
-private:
-
-  /// Call the Type constructor
-  static void destroy(void * ptr)
-  {
-    reinterpret_cast<Type*>(ptr)->~Type();
-  }
-
-  /// placement new the Type in buffer
-  static void * create(void * buffer)
-  {
-    return new (buffer) Type();
-  }
-};
-
-
-//-----------------------------------------------------------------------------
-// AllocatorBase
-//-----------------------------------------------------------------------------
-
-/// class AllocatorBase
-///
-/// Abstract base class for all Allocators.
-/// Allocators should be singleton objects, use Singleton<Allocator>::get to create
-/// to avoid order of destruction issues
-class AllocatorBase
-{
-public:
-  /// name of the allocator
-  /// used to report memory leaks
-  virtual const char * name() const = 0;
-
-  /// Allocate a buffer of size number of bytes
-  virtual void* allocate(size_t size) const = 0;
-
-  /// Deallocate a buffer with size number of bytes
-  /// The pointer must have been allocated with a call to corresponding allocate
-  virtual void deallocate(void * ptr, size_t size) const = 0;
-
-  /// Changes the size of the memory block pointed to by ptr.
-  /// Ptr must have been allocated with the corresponding allocate call
-  /// The function may move the memory block to a new location
-  /// (whose address is returned by the function).
-  ///
-  /// The content of the memory block is preserved up to the lesser of the new and
-  /// old sizes, even if the block is moved to a new location. If the new size is larger,
-  /// the value of the newly allocated portion is indeterminate.
-  ///
-  /// In case that ptr is a null pointer, the function behaves like allocate, assigning a
-  /// new block of size bytes and returning a pointer to its beginning.
-  virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
-
-  /// can a texture object be bound to the allocated memory
-  virtual bool support_texture_binding() const = 0;
-
-  /// virtual destructor
-  virtual ~AllocatorBase() {}
-};
-
-/// class AllocatorAttributeBase
-class AllocatorAttributeBase
-{
-public:
-  virtual ~AllocatorAttributeBase() {}
-};
-
-//-----------------------------------------------------------------------------
-// Allocator< StaticAllocator > : public AllocatorBase
-//-----------------------------------------------------------------------------
-
-// HasStaticName
-template<typename T>
-class HasStaticName
-{
-  typedef const char * (*static_method)();
-  template<typename U, static_method> struct SFINAE {};
-  template<typename U> static char Test(SFINAE<U, &U::name>*);
-  template<typename U> static int Test(...);
-public:
-  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
-};
-
-
-template <typename T>
-inline
-typename enable_if<HasStaticName<T>::value, const char *>::type
-allocator_name()
-{
-  return T::name();
-}
-
-template <typename T>
-inline
-typename enable_if<!HasStaticName<T>::value, const char *>::type
-allocator_name()
-{
-  return "Unnamed Allocator";
-}
-
-
-// HasStaticAllocate
-template<typename T>
-class HasStaticAllocate
-{
-  typedef void * (*static_method)(size_t);
-  template<typename U, static_method> struct SFINAE {};
-  template<typename U> static char Test(SFINAE<U, &U::allocate>*);
-  template<typename U> static int Test(...);
-public:
-  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
-};
-
-template <typename T>
-inline
-typename enable_if<HasStaticAllocate<T>::value, void *>::type
-allocator_allocate(size_t size)
-{
-  return T::allocate(size);
-}
-
-template <typename T>
-inline
-typename enable_if<!HasStaticAllocate<T>::value, void *>::type
-allocator_allocate(size_t)
-{
-  throw_runtime_exception(  std::string("Error: ")
-                          + std::string(allocator_name<T>())
-                          + std::string(" cannot allocate memory!") );
-  return NULL;
-}
-
-// HasStaticDeallocate
-template<typename T>
-class HasStaticDeallocate
-{
-  typedef void (*static_method)(void *, size_t);
-  template<typename U, static_method> struct SFINAE {};
-  template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
-  template<typename U> static int Test(...);
-public:
-  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
-};
-
-template <typename T>
-inline
-typename enable_if<HasStaticDeallocate<T>::value, void>::type
-allocator_deallocate(void * ptr, size_t size)
-{
-  T::deallocate(ptr,size);
-}
-
-template <typename T>
-inline
-typename enable_if<!HasStaticDeallocate<T>::value, void>::type
-allocator_deallocate(void *, size_t)
-{
-  throw_runtime_exception(  std::string("Error: ")
-                          + std::string(allocator_name<T>())
-                          + std::string(" cannot deallocate memory!") );
-}
-
-// HasStaticReallocate
-template<typename T>
-class HasStaticReallocate
-{
-  typedef void * (*static_method)(void *, size_t, size_t);
-  template<typename U, static_method> struct SFINAE {};
-  template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
-  template<typename U> static int Test(...);
-public:
-  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
-};
-
-template <typename T>
-inline
-typename enable_if<HasStaticReallocate<T>::value, void *>::type
-allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  return T::reallocate(old_ptr, old_size, new_size);
-}
-
-template <typename T>
-inline
-typename enable_if<!HasStaticReallocate<T>::value, void *>::type
-allocator_reallocate(void *, size_t, size_t)
-{
-  throw_runtime_exception(  std::string("Error: ")
-                          + std::string(allocator_name<T>())
-                          + std::string(" cannot reallocate memory!") );
-  return NULL;
-}
-
-// HasStaticReallocate
-template<typename T>
-class HasStaticSupportTextureBinding
-{
-  typedef bool (*static_method)();
-  template<typename U, static_method> struct SFINAE {};
-  template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
-  template<typename U> static int Test(...);
-public:
-  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
-};
-
-template <typename T>
-inline
-typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
-allocator_support_texture_binding()
-{
-  return T::support_texture_binding();
-}
-
-template <typename T>
-inline
-typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
-allocator_support_texture_binding()
-{
-  return false;
-}
-
-template <typename T>
-class Allocator : public AllocatorBase
-{
-public:
-  virtual const char * name() const
-  {
-    return allocator_name<T>();
-  }
-
-  virtual void* allocate(size_t size) const
-  {
-    return allocator_allocate<T>(size);
-  }
-
-  virtual void deallocate(void * ptr, size_t size) const
-  {
-    allocator_deallocate<T>(ptr,size);
-  }
-
-  virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
-  {
-    return allocator_reallocate<T>(old_ptr, old_size, new_size);
-  }
-
-  virtual bool support_texture_binding() const
-  {
-    return allocator_support_texture_binding<T>();
-  }
-
-  static AllocatorBase * singleton()
-  {
-    return Singleton< Allocator<T> >::get();
-  }
-};
-
-//-----------------------------------------------------------------------------
-// AllocationTracker
-//-----------------------------------------------------------------------------
-
-// forward declaration for friend classes
-struct MallocHelper;
-
-/// class AllocationTracker
-/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
-/// Reference counting is disabled when the host is in parallel.
-class AllocationTracker
-{
-  // use the least significant bit of the AllocationRecord pointer to indicate if the
-  // AllocationTracker should reference count
-  enum {
-     REF_COUNT_BIT = static_cast<uintptr_t>(1)
-   , REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
-  };
-
-public:
-
-  /// Find an AllocationTracker such that
-  /// alloc_ptr <= ptr < alloc_ptr + alloc_size
-  /// O(n) where n is the number of tracked allocations.
-  template <typename StaticAllocator>
-  static AllocationTracker find( void const * ptr )
-  {
-    return find( ptr, Allocator<StaticAllocator>::singleton() );
-  }
-
-
-  /// Pretty print all the currently tracked memory
-  static void print_tracked_memory( std::ostream & out );
-
-  /// Default constructor
-  KOKKOS_INLINE_FUNCTION
-  AllocationTracker()
-    : m_alloc_rec(0)
-  {}
-
-  /// Create a AllocationTracker
-  ///
-  /// Start reference counting the alloc_ptr.
-  /// When the reference count reachs 0 the allocator deallocate method
-  /// will be call with the given size.  The alloc_ptr should have been
-  /// allocated with the allocator's allocate method.
-  ///
-  /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
-  /// do nothing
-  template <typename StaticAllocator>
-  AllocationTracker(  StaticAllocator const &
-                    , void * arg_alloc_ptr
-                    , size_t arg_alloc_size
-                    , const std::string & arg_label = std::string("") )
-    : m_alloc_rec(0)
-  {
-    AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
-    initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
-  }
-
-  /// Create a AllocationTracker
-  ///
-  /// Start reference counting the alloc_ptr.
-  /// When the reference count reachs 0 the allocator deallocate method
-  /// will be call with the given size.  The alloc_ptr should have been
-  /// allocated with the allocator's allocate method.
-  ///
-  /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
-  /// do nothing
-  template <typename StaticAllocator>
-  AllocationTracker(  StaticAllocator const &
-                    , size_t arg_alloc_size
-                    , const std::string & arg_label = std::string("")
-                   )
-    : m_alloc_rec(0)
-  {
-    AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
-    void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
-
-    initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
-  }
-
-  /// Copy an AllocatorTracker
-  KOKKOS_INLINE_FUNCTION
-  AllocationTracker( const AllocationTracker & rhs )
-    : m_alloc_rec( rhs.m_alloc_rec)
-  {
-#if !defined( __CUDA_ARCH__ )
-    if ( rhs.ref_counting() && tracking_enabled() ) {
-      increment_ref_count();
-    }
-    else {
-      m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
-    }
-#else
-    m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
-#endif
-  }
-
-  /// Copy an AllocatorTracker
-  /// Decrement the reference count of the current tracker if necessary
-  KOKKOS_INLINE_FUNCTION
-  AllocationTracker & operator=( const AllocationTracker & rhs )
-  {
-    if (this != &rhs) {
-#if !defined( __CUDA_ARCH__ )
-      if ( ref_counting() ) {
-        decrement_ref_count();
-      }
-
-      m_alloc_rec = rhs.m_alloc_rec;
-
-      if ( rhs.ref_counting() && tracking_enabled() ) {
-        increment_ref_count();
-      }
-      else {
-        m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
-      }
-#else
-      m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
-#endif
-    }
-
-    return * this;
-  }
-
-  /// Destructor
-  /// Decrement the reference count if necessary
-  KOKKOS_INLINE_FUNCTION
-  ~AllocationTracker()
-  {
-#if !defined( __CUDA_ARCH__ )
-    if ( ref_counting() ) {
-      decrement_ref_count();
-    }
-#endif
-  }
-
-  /// Is the tracker valid?
-  KOKKOS_INLINE_FUNCTION
-  bool is_valid() const
-  {
-    return (m_alloc_rec & REF_COUNT_MASK);
-  }
-
-
-
-  /// clear the tracker
-  KOKKOS_INLINE_FUNCTION
-  void clear()
-  {
-#if !defined( __CUDA_ARCH__ )
-    if ( ref_counting() ) {
-      decrement_ref_count();
-    }
-#endif
-    m_alloc_rec = 0;
-  }
-
-  /// is this tracker currently counting allocations?
-  KOKKOS_INLINE_FUNCTION
-  bool ref_counting() const
-  {
-    return (m_alloc_rec & REF_COUNT_BIT);
-  }
-
-  AllocatorBase * allocator() const;
-
-  /// pointer to the allocated memory
-  void * alloc_ptr()  const;
-
-  /// size in bytes of the allocated memory
-  size_t alloc_size() const;
-
-  /// the current reference count
-  size_t ref_count()  const;
-
-  /// the label given to the allocation
-  char const * label() const;
-
-  /// pretty print all the tracker's information to the std::ostream
-  void print( std::ostream & oss) const;
-
-
-  /// set an attribute ptr on the allocation record
-  /// the arg_attribute pointer will be deleted when the record is destroyed
-  /// the attribute ptr can only be set once
-  bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
-
-  /// get the attribute ptr from the allocation record
-  AllocatorAttributeBase * attribute() const;
-
-
-  /// reallocate the memory tracked by this allocation
-  /// NOT thread-safe
-  void reallocate( size_t size ) const;
-
-  static void disable_tracking();
-  static void enable_tracking();
-  static bool tracking_enabled();
-
-private:
-
-  static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
-
-  void initalize(  AllocatorBase * arg_allocator
-                 , void * arg_alloc_ptr
-                 , size_t arg_alloc_size
-                 , std::string const & label );
-
-  void increment_ref_count() const;
-  void decrement_ref_count() const;
-
-  friend struct Impl::MallocHelper;
-
-  uintptr_t m_alloc_rec;
-};
-
-}} // namespace Kokkos::Impl
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-#endif //KOKKOS_ALLOCATION_TRACKER_HPP
-
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
new file mode 100644
index 0000000000..0246a7b9af
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp
@@ -0,0 +1,197 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
+#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Concepts.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos { namespace Impl {
+
+template < typename ExecutionSpace   = void
+         , typename Schedule         = void
+         , typename WorkTag          = void
+         , typename IndexType        = void
+         , typename IterationPattern = void
+         >
+struct PolicyTraitsBase
+{
+  using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
+
+  using execution_space   = ExecutionSpace;
+  using schedule_type     = Schedule;
+  using work_tag          = WorkTag;
+  using index_type        = IndexType;
+  using iteration_pattern = IterationPattern;
+};
+
+
+template <typename PolicyBase, typename ExecutionSpace>
+struct SetExecutionSpace
+{
+  static_assert( is_void<typename PolicyBase::execution_space>::value
+               , "Kokkos Error: More than one execution space given" );
+  using type = PolicyTraitsBase< ExecutionSpace
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+
+template <typename PolicyBase, typename Schedule>
+struct SetSchedule
+{
+  static_assert( is_void<typename PolicyBase::schedule_type>::value
+               , "Kokkos Error: More than one schedule type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , Schedule
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+
+template <typename PolicyBase, typename WorkTag>
+struct SetWorkTag
+{
+  static_assert( is_void<typename PolicyBase::work_tag>::value
+               , "Kokkos Error: More than one work tag given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , WorkTag
+                               , typename PolicyBase::index_type
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+
+template <typename PolicyBase, typename IndexType>
+struct SetIndexType
+{
+  static_assert( is_void<typename PolicyBase::index_type>::value
+               , "Kokkos Error: More than one index type given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , IndexType
+                               , typename PolicyBase::iteration_pattern
+                               >;
+};
+
+
+template <typename PolicyBase, typename IterationPattern>
+struct SetIterationPattern
+{
+  static_assert( is_void<typename PolicyBase::iteration_pattern>::value
+               , "Kokkos Error: More than one iteration_pattern given" );
+  using type = PolicyTraitsBase< typename PolicyBase::execution_space
+                               , typename PolicyBase::schedule_type
+                               , typename PolicyBase::work_tag
+                               , typename PolicyBase::index_type
+                               , IterationPattern
+                               >;
+};
+
+
+template <typename Base, typename... Traits>
+struct AnalyzePolicy;
+
+template <typename Base, typename T, typename... Traits>
+struct AnalyzePolicy<Base, T, Traits...> : public
+  AnalyzePolicy<
+      typename std::conditional< is_execution_space<T>::value  , SetExecutionSpace<Base,T>
+    , typename std::conditional< is_schedule_type<T>::value    , SetSchedule<Base,T>
+    , typename std::conditional< is_index_type<T>::value       , SetIndexType<Base,T>
+    , typename std::conditional< std::is_integral<T>::value    , SetIndexType<Base, IndexType<T> >
+    , typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
+    , SetWorkTag<Base,T>
+    >::type >::type >::type >::type>::type::type
+  , Traits...
+  >
+{};
+
+template <typename Base>
+struct AnalyzePolicy<Base>
+{
+  using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
+                                                   , DefaultExecutionSpace
+                                                   , typename Base::execution_space
+                                                   >::type;
+
+  using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
+                                                 , Schedule< Static >
+                                                 , typename Base::schedule_type
+                                                 >::type;
+
+  using work_tag = typename Base::work_tag;
+
+  using index_type = typename std::conditional< is_void< typename Base::index_type >::value
+                                              , IndexType< typename execution_space::size_type >
+                                              , typename Base::index_type
+                                              >::type
+                                               ::type // nasty hack to make index_type into an integral_type
+                                              ;       // instead of the wrapped IndexType<T> for backwards compatibility
+
+  using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
+                                                     , void // TODO set default iteration pattern
+                                                     , typename Base::iteration_pattern
+                                                     >::type;
+  using type = PolicyTraitsBase< execution_space
+                               , schedule_type
+                               , work_tag
+                               , index_type
+                               , iteration_pattern
+                               >;
+};
+
+template <typename... Traits>
+struct PolicyTraits
+  : public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
+{};
+
+}} // namespace Kokkos::Impl
+
+
+#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
index 8a27ce6f22..fd7ea845e7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
   while( !Impl::lock_address_host_space( (void*) dest ) );
   T return_val = *dest;
   if( return_val == compare ) {
-    const T tmp = *dest = val;
+    // Don't use the following line of code here:
+    //
+    //const T tmp = *dest = val;
+    //
+    // Instead, put each assignment in its own statement.  This is
+    // because the overload of T::operator= for volatile *this should
+    // return void, not volatile T&.  See Kokkos #177:
+    //
+    // https://github.com/kokkos/kokkos/issues/177
+    *dest = val;
+    const T tmp = *dest;
     #ifndef KOKKOS_COMPILER_CLANG
     (void) tmp;
     #endif
@@ -239,7 +249,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
   {
     retval = dest[0];
     if ( retval == compare )
-  	dest[0] = val;
+        dest[0] = val;
   }
   return retval;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
index 8990604674..e8cac4ba3b 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
 {
   while( !Impl::lock_address_host_space( (void*) dest ) );
   T return_val = *dest;
-  const T tmp = *dest = val;
+  // Don't use the following line of code here:
+  //
+  //const T tmp = *dest = val;
+  //
+  // Instead, put each assignment in its own statement.  This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&.  See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = val;
+  const T tmp = *dest;
   #ifndef KOKKOS_COMPILER_CLANG
   (void) tmp;
   #endif
@@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
   // member.  The volatile return value implicitly defines a
   // dereference that some compilers (gcc 4.7.2) warn is being ignored.
   // Suppress warning by casting return to void.
-  (void)( *dest = val );
+  //(void)( *dest = val );
+  *dest = val;
+
   Impl::unlock_address_host_space( (void*) dest );
 }
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
index 239bbf7cbb..62dfcdd2f8 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
     assume.i = oldval.i ;
     newval.t = assume.t + val ;
     oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
-  } while ( assumed.i != oldval.i );
+  } while ( assume.i != oldval.i );
 
   return oldval.t ;
 }
@@ -156,9 +156,26 @@ T atomic_fetch_add( volatile T * const dest ,
 
 #elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
 
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * dest , const int val )
+{
+        int original = val;
+
+        __asm__ __volatile__(
+                "lock xadd %1, %0"
+                : "+m" (*dest), "+r" (original)
+                : "m" (*dest), "r" (original)
+                : "memory"
+        );
+
+        return original;
+}
+#else
 KOKKOS_INLINE_FUNCTION
 int atomic_fetch_add( volatile int * const dest , const int val )
-{ return __sync_fetch_and_add(dest,val); }
+{ return __sync_fetch_and_add(dest, val); }
+#endif
 
 KOKKOS_INLINE_FUNCTION
 long int atomic_fetch_add( volatile long int * const dest , const long int val )
@@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
 {
   while( !Impl::lock_address_host_space( (void*) dest ) );
   T return_val = *dest;
-  const T tmp = *dest = return_val + val;
+  // Don't use the following line of code here:
+  //
+  //const T tmp = *dest = return_val + val;
+  //
+  // Instead, put each assignment in its own statement.  This is
+  // because the overload of T::operator= for volatile *this should
+  // return void, not volatile T&.  See Kokkos #177:
+  //
+  // https://github.com/kokkos/kokkos/issues/177
+  *dest = return_val + val;
+  const T tmp = *dest;
   (void) tmp;
   Impl::unlock_address_host_space( (void*) dest );
   return return_val;
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
index 647b3ad4e1..a3a57aa81c 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
     assume.i = oldval.i ;
     newval.t = assume.t - val ;
     oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
-  } while ( assumed.i != oldval.i );
+  } while ( assume.i != oldval.i );
 
   return oldval.t ;
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
index 76f3ccac73..343e9bf4c4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -48,6 +48,22 @@
 namespace Kokkos {
 namespace Impl {
 
+template<class Scalar1, class Scalar2>
+struct MaxOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MinOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+};
+
 template<class Scalar1, class Scalar2>
 struct AddOper {
   KOKKOS_FORCEINLINE_FUNCTION
@@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
 namespace Kokkos {
 
 // Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
+}
+
 template < typename T >
 KOKKOS_INLINE_FUNCTION
 T atomic_fetch_mul(volatile T * const dest, const T val) {
@@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
 
 
 // Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
+}
+
 template < typename T >
 KOKKOS_INLINE_FUNCTION
 T atomic_mul_fetch(volatile T * const dest, const T val) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
index 4a2a408273..6e48faa694 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
   typedef int64_t type;
 };
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-class AllocationTracker;
-
-// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
-template<class ViewTraits>
-class ViewDataHandle<
-  ViewTraits ,
-  typename enable_if<
-    ( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
-    ( ViewTraits::memory_traits::Atomic )
-  >::type >
-{
-private:
-//  typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
-//                        (sizeof(typename ViewTraits::const_value_type)==8),
-//                         int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
-//                   atomic_view_possible;
-  typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
-  typedef ViewDataHandle self_type;
-
-public:
-  enum {  ReturnTypeIsReference = false };
-
-  typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
-  typedef Impl::AtomicDataElement<ViewTraits>    return_type;
-
-  KOKKOS_INLINE_FUNCTION
-  static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
-  {
-    return handle_type(arg_data_ptr);
-  }
-};
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 }} // namespace Kokkos::Impl
 
 #endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp
deleted file mode 100644
index 7cf233c689..0000000000
--- a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_HostSpace.hpp>
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-#include <impl/Kokkos_BasicAllocators.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-
-#include <stdint.h>    // uintptr_t
-#include <cstdlib>     // for malloc, realloc, and free
-#include <cstring>     // for memcpy
-
-#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
-#include <sys/mman.h>  // for mmap, munmap, MAP_ANON, etc
-#include <unistd.h>    // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
-#endif
-
-#include <sstream>
-
-namespace Kokkos { namespace Impl {
-
-/*--------------------------------------------------------------------------*/
-
-void* MallocAllocator::allocate( size_t size )
-{
-  void * ptr = NULL;
-  if (size) {
-    ptr = malloc(size);
-
-    if (!ptr)
-    {
-      std::ostringstream msg ;
-      msg << name() << ": allocate(" << size << ") FAILED";
-      throw_runtime_exception( msg.str() );
-    }
-  }
-  return ptr;
-}
-
-void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
-{
-  if (ptr) {
-    free(ptr);
-  }
-}
-
-void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
-{
-  void * ptr = realloc(old_ptr, new_size);
-
-  if (new_size > 0u && ptr == NULL) {
-    throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
-  }
-  return ptr;
-}
-
-/*--------------------------------------------------------------------------*/
-
-namespace {
-
-void * raw_aligned_allocate( size_t size, size_t alignment )
-{
-  void * ptr = NULL;
-  if ( size ) {
-#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
-    ptr = _mm_malloc( size , alignment );
-
-#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
-
-    posix_memalign( & ptr, alignment , size );
-
-#else
-    // Over-allocate to and round up to guarantee proper alignment.
-    size_t size_padded = size + alignment + sizeof(void *);
-    void * alloc_ptr = malloc( size_padded );
-
-    if (alloc_ptr) {
-      uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
-      // offset enough to record the alloc_ptr
-      address += sizeof(void *);
-      uintptr_t rem = address % alignment;
-      uintptr_t offset = rem ? (alignment - rem) : 0u;
-      address += offset;
-      ptr = reinterpret_cast<void *>(address);
-      // record the alloc'd pointer
-      address -= sizeof(void *);
-      *reinterpret_cast<void **>(address) = alloc_ptr;
-    }
-#endif
-  }
-  return ptr;
-}
-
-void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
-{
-  if ( ptr ) {
-#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
-    _mm_free( ptr );
-
-#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
-    free( ptr );
-#else
-    // get the alloc'd pointer
-    void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
-    free( alloc_ptr );
-#endif
-  }
-
-}
-
-}
-
-void* AlignedAllocator::allocate( size_t size )
-{
-  void * ptr = 0 ;
-
-  if ( size ) {
-    ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
-
-    if (!ptr)
-    {
-      std::ostringstream msg ;
-      msg << name() << ": allocate(" << size << ") FAILED";
-      throw_runtime_exception( msg.str() );
-    }
-  }
-  return ptr;
-}
-
-void AlignedAllocator::deallocate( void * ptr, size_t size )
-{
-  raw_aligned_deallocate( ptr, size);
-}
-
-void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  void * ptr = old_ptr;;
-
-  if (old_size < new_size) {
-    ptr = allocate( new_size );
-
-    memcpy(ptr, old_ptr, old_size );
-
-    deallocate( old_ptr, old_size );
-  }
-
-  return ptr;
-}
-
-/*--------------------------------------------------------------------------*/
-
-// mmap flags for private anonymous memory allocation
-#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
-  #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
-#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
-  #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
-#else
-  #define NO_MMAP
-#endif
-
-// huge page tables
-#if !defined( NO_MMAP )
-  #if defined( MAP_HUGETLB )
-    #define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
-  #elif defined( MMAP_FLAGS )
-    #define MMAP_FLAGS_HUGE MMAP_FLAGS
-  #endif
-  // threshold to use huge pages
-  #define MMAP_USE_HUGE_PAGES (1u << 27)
-#endif
-
-// read write access to private memory
-#if !defined( NO_MMAP )
-  #define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
-#endif
-
-
-void* PageAlignedAllocator::allocate( size_t size )
-{
-  void *ptr = NULL;
-  if (size) {
-#if !defined NO_MMAP
-    if ( size < MMAP_USE_HUGE_PAGES ) {
-      ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
-    } else {
-      ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
-    }
-    if (ptr == MAP_FAILED) {
-      ptr = NULL;
-    }
-#else
-    static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
-
-    ptr = raw_aligned_allocate( size, page_size);
-#endif
-    if (!ptr)
-    {
-      std::ostringstream msg ;
-      msg << name() << ": allocate(" << size << ") FAILED";
-      throw_runtime_exception( msg.str() );
-    }
-  }
-  return ptr;
-}
-
-void PageAlignedAllocator::deallocate( void * ptr, size_t size )
-{
-#if !defined( NO_MMAP )
-  munmap(ptr, size);
-#else
-  raw_aligned_deallocate(ptr, size);
-#endif
-}
-
-void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
-{
-  void * ptr = NULL;
-#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
-
-  if (old_size != new_size) {
-    ptr = allocate( new_size );
-
-    memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
-
-    deallocate( old_ptr, old_size );
-  }
-  else {
-    ptr = old_ptr;
-  }
-#else
-  ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
-
-  if (ptr == MAP_FAILED) {
-    throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
-  }
-#endif
-
-  return ptr;
-}
-
-}} // namespace Kokkos::Impl
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
new file mode 100644
index 0000000000..0ffbc0548a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -0,0 +1,122 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITOPS_HPP
+#define KOKKOS_BITOPS_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+#include <climits>
+
+namespace Kokkos {
+namespace Impl {
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __ffs(i) - 1;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_forward(i);
+#else
+
+  unsigned t = 1u;
+  int r = 0;
+  while ( i && ( ( i & t ) == 0 ) )
+  {
+    t = t << 1;
+    ++r;
+  }
+  return r;
+#endif
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_reverse( unsigned i )
+{
+  enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
+#if defined( __CUDA_ARCH__ )
+  return shift - __clz(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return shift - __builtin_clz(i);
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_reverse(i);
+#else
+  unsigned t = 1u << shift;
+  int r = 0;
+  while ( i && ( ( i & t ) == 0 ) )
+  {
+    t = t >> 1;
+    ++r;
+  }
+  return r;
+#endif
+}
+
+/// Count the number of bits set.
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_count( unsigned i )
+{
+#if defined( __CUDA_ARCH__ )
+  return __popc(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
+#elif defined ( __INTEL_COMPILER )
+  return _popcnt32(i);
+#else
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
+  i = i - ( ( i >> 1 ) & ~0u / 3u );                             // temp
+  i = ( i & ~0u / 15u * 3u ) + ( ( i >> 2 ) & ~0u / 15u * 3u );  // temp
+  i = ( i + ( i >> 4 ) ) & ~0u / 255u * 15u;                     // temp
+
+  // count
+  return (int)( ( i * ( ~0u / 255u ) ) >> ( sizeof(unsigned) - 1 ) * CHAR_BIT );
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif // KOKKOS_BITOPS_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
index 94db15d96f..567a214140 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -148,7 +148,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
 #endif
 
 #if (KOKKOS_ENABLE_PROFILING)
-    Kokkos::Experimental::initialize();
+    Kokkos::Profiling::initialize();
 #endif
 }
 
@@ -190,7 +190,7 @@ void finalize_internal( const bool all_spaces = false )
 #endif
 
 #if (KOKKOS_ENABLE_PROFILING)
-    Kokkos::Experimental::finalize();
+    Kokkos::Profiling::finalize();
 #endif
 
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
index 43a1b2afbd..78b6794491 100644
--- a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -75,6 +75,7 @@ struct FunctorValueTraits
   typedef void value_type ;
   typedef void pointer_type ;
   typedef void reference_type ;
+  typedef void functor_type ;
 
   enum { StaticValueSize = 0 };
 
@@ -88,7 +89,10 @@ struct FunctorValueTraits
 template<class ArgTag>
 struct FunctorValueTraits<void, ArgTag,false>
 {
-  typedef void reference_type;
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+  typedef void functor_type ;
 };
 
 /** \brief  FunctorType::value_type is explicitly declared so use it.
@@ -106,6 +110,7 @@ template< class FunctorType , class ArgTag >
 struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ >
 {
   typedef typename Impl::remove_extent< typename FunctorType::value_type >::type  value_type ;
+  typedef FunctorType functor_type;
 
   static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
     "Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
@@ -342,6 +347,7 @@ public:
   typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType   >::type  value_type ;
   typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type  pointer_type ;
   typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type  reference_type ;
+  typedef FunctorType functor_type;
 
   static_assert( IS_VOID || IS_REJECT || 0 == ( sizeof(ValueType) % sizeof(int) ) ,
     "Reduction functor's value_type deduced from functor::operator() requires: 0 == sizeof(value_type) % sizeof(int)" );
@@ -568,24 +574,56 @@ struct FunctorValueJoin ;
 template< class FunctorType , class ArgTag , class T , class Enable >
 struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable >
 {
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& ){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
   void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
     {
       *((volatile T*)lhs) += *((const volatile T*)rhs);
     }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      lhs += rhs;
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
+    {
+      lhs += rhs;
+    }
 };
 
 /* No 'join' function provided, array of values */
 template< class FunctorType , class ArgTag , class T , class Enable >
 struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable >
 {
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
     {
-      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f_);
 
       for ( int i = 0 ; i < n ; ++i ) { ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i]; }
     }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; }
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      for ( int i = 0 ; i < n ; ++i ) { lhs[i] += rhs[i]; }
+    }
 };
 
 /* 'join' function provided, single value */
@@ -599,10 +637,25 @@ struct FunctorValueJoin
   , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
   >
 {
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      f.join( ArgTag() , lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
     {
-      f.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) );
+      f.join( ArgTag(), lhs , rhs );
     }
 };
 
@@ -617,10 +670,25 @@ struct FunctorValueJoin
   , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
   >
 {
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
     {
-      f.join( *((volatile T *)lhs) , *((const volatile T *)rhs) );
+      f_.join( *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T& lhs , const volatile T& rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T& lhs , const T& rhs ) const
+    {
+      f.join( lhs , rhs );
     }
 };
 
@@ -635,10 +703,25 @@ struct FunctorValueJoin
   , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
   >
 {
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( volatile T* const lhs , const volatile T* const rhs ) const
     {
-      f.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs );
+      f.join( ArgTag() , lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
+    {
+      f.join( ArgTag(), lhs , rhs );
     }
 };
 
@@ -653,10 +736,25 @@ struct FunctorValueJoin
   , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
   >
 {
+  const FunctorType& f;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  FunctorValueJoin(const FunctorType& f_):f(f_){}
+
   KOKKOS_FORCEINLINE_FUNCTION static
-  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+  void join( const FunctorType & f_ , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f_.join( (volatile T *)lhs , (const volatile T *)rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( volatile T* const lhs , const volatile T* const rhs ) const
+    {
+      f.join( lhs , rhs );
+    }
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator() ( T* lhs , const T* rhs ) const
     {
-      f.join( (volatile T *)lhs , (const volatile T *)rhs );
+      f.join( lhs , rhs );
     }
 };
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index 20956ce593..11cc120212 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -56,7 +56,6 @@
 #include <algorithm>
 
 #include <Kokkos_HBWSpace.hpp>
-#include <impl/Kokkos_BasicAllocators.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
 #ifdef KOKKOS_HAVE_HBWSPACE
@@ -126,23 +125,6 @@ int HBWSpace::in_parallel()
 
 /*--------------------------------------------------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace Kokkos {
-namespace Experimental {
-
-Kokkos::Impl::AllocationTracker HBWSpace::allocate_and_track( const std::string & label, const size_t size )
-{
-  return Kokkos::Impl::AllocationTracker( allocator(), size, label );
-}
-
-} // namespace Experimental
-} // namespace Kokkos
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-/*--------------------------------------------------------------------------*/
-
 namespace Kokkos {
 namespace Experimental {
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index 9dc774cdeb..b52f4591ef 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -93,7 +93,6 @@
 #include <cstring>
 
 #include <Kokkos_HostSpace.hpp>
-#include <impl/Kokkos_BasicAllocators.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Kokkos_Atomic.hpp>
 
@@ -156,21 +155,6 @@ int HostSpace::in_parallel()
 
 /*--------------------------------------------------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace Kokkos {
-
-Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size )
-{
-  return Impl::AllocationTracker( allocator(), size, label );
-}
-
-} // namespace Kokkos
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-/*--------------------------------------------------------------------------*/
-
 namespace Kokkos {
 
 /* Default allocation mechanism */
diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp b/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp
deleted file mode 100644
index bb858d8d9e..0000000000
--- a/lib/kokkos/core/src/impl/Kokkos_MemoryPool_Inline.hpp
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_MEMORYPOOL_CPP
-#define KOKKOS_MEMORYPOOL_CPP
-
-// How should errors be handled?  In general, production code should return a
-// value indicating failure so the user can decide how the error is handled.
-// While experimental, code can abort instead.  If KOKKOS_MEMPOOLLIST_PRINTERR
-// is defined, the code will abort with an error message.  Otherwise, the code
-// will return with a value indicating failure when possible, or do nothing
-// instead.
-//#define KOKKOS_MEMPOOLLIST_PRINTERR
-
-//#define KOKKOS_MEMPOOLLIST_PRINT_INFO
-
-//----------------------------------------------------------------------------
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-
-/* This '.cpp' is being included by the header file
- * to inline these functions for Cuda.
- *
- *  Prefer to implement these functions in a separate
- *  compilation unit.  However, the 'nvcc' linker
- *  has an internal error when attempting separate compilation
- *  (--relocatable-device-code=true)
- *  of Kokkos unit tests.
- */
-
-#define KOKKOS_MEMPOOLLIST_INLINE inline
-
-#else
-
-/*  This '.cpp' file is being separately compiled for the Host */
-
-#include <Kokkos_MemoryPool.hpp>
-#include <Kokkos_Atomic.hpp>
-
-#define KOKKOS_MEMPOOLLIST_INLINE /* */
-
-#endif
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Experimental {
-namespace Impl {
-
-#if defined(KOKKOS_MEMPOOLLIST_PRINT_INFO) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
-long MemPoolList::m_count = 0;
-#endif
-
-KOKKOS_FUNCTION
-KOKKOS_MEMPOOLLIST_INLINE
-uint64_t
-MemPoolList::acquire_lock( volatile uint64_t * freelist ) const
-{
-  uint64_t old_head;
-  bool locked = false;
-
-  while ( !locked ) {
-    old_head = *freelist;
-
-    if ( old_head != FREELIST_LOCK_HEAD ) {
-      // In the initial look at the head, the freelist wasn't locked.
-      // Attempt to lock the head of list.  If the list was changed (including
-      // being locked) between the initial look and now, head will be different
-      // than old_head.  This means the lock can't proceed and has to be
-      // tried again.
-      uint64_t head =
-        atomic_compare_exchange( freelist, old_head, uint64_t(FREELIST_LOCK_HEAD) );
-
-      if ( head == old_head ) locked = true;
-    }
-  }
-
-  return old_head;
-}
-
-KOKKOS_FUNCTION
-KOKKOS_MEMPOOLLIST_INLINE
-void
-MemPoolList::release_lock( volatile uint64_t * freelist, uint64_t new_head ) const
-{
-  // This function is only intended to be called if acquire_lock() has already
-  // been called to acquire a lock on freelist.  Thus, we know that the value
-  // pointed to by freelist is FREELIST_LOCK_HEAD.
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  uint64_t head =
-#endif
-    atomic_compare_exchange( freelist, uint64_t(FREELIST_LOCK_HEAD), new_head );
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  if ( head != FREELIST_LOCK_HEAD ) {
-    // We shouldn't get here, but this check is here for sanity.
-    printf( "\n** MemoryPool::allocate() UNLOCK_ERROR(0x%llx) **\n",
-            reinterpret_cast<uint64_t>( freelist ) );
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    fflush( stdout );
-#endif
-    Kokkos::abort( "" );
-  }
-#endif
-}
-
-KOKKOS_FUNCTION
-KOKKOS_MEMPOOLLIST_INLINE
-void *
-MemPoolList::refill_freelist( size_t l_exp ) const
-{
-  void * p = 0;
-  volatile uint64_t * l_exp_freelist = m_freelist + l_exp;
-
-  // The l_exp freelist is empty. Grab a lock on the freelist.
-  uint64_t l_exp_old_head = acquire_lock( l_exp_freelist );
-
-  uint64_t l_exp_old_head_off = get_head_offset( l_exp_old_head );
-
-  if ( l_exp_old_head_off != FREELIST_END ) {
-    // Another thread put some more entries on the freelist between when
-    // this thread saw it empty and acquired the lock.  Just return an entry.
-    uint64_t l_exp_old_head_tag = get_head_tag( l_exp_old_head );
-    uint64_t new_head_tag = increment_tag( l_exp_old_head_tag );
-    uint64_t new_head_off = *reinterpret_cast<uint64_t *>( m_data + l_exp_old_head_off );
-    uint64_t new_head = create_head( new_head_off, new_head_tag );
-
-    // Release the lock, replacing the head with the next entry on the list.
-    release_lock( l_exp_freelist, new_head );
-
-    // Set the chunk to return.
-    p = m_data + l_exp_old_head_off;
-  }
-  else {
-    // The l_exp freelist is empty.
-
-    size_t l = l_exp + 1;
-    bool done = false;
-
-    while ( !done ) {
-      // Find the next freelist that is either locked or not empty.  A locked
-      // freelist will probably have memory available when the lock is
-      // released.
-      while ( m_chunk_size[l] > 0 &&
-              get_head_offset( m_freelist[l] ) == FREELIST_END ) ++l;
-
-      if ( m_chunk_size[l] == 0 ) {
-        // We got to the end of the list of freelists without finding any
-        // available memory which means the pool is empty.  Release the lock
-        // on the l_exp freelist.
-        release_lock( l_exp_freelist, l_exp_old_head );
-
-        // Exit out of the loop.
-        done = true;
-      }
-      else {
-        volatile uint64_t * l_freelist = m_freelist + l;
-
-        // Grab a lock on the l freelist.
-        uint64_t l_old_head = acquire_lock( l_freelist );
-        uint64_t l_old_head_off = get_head_offset( l_old_head );
-
-        if ( l_old_head_off != FREELIST_END ) {
-          // The l freelist has chunks.  Grab one to divide.
-
-          // Create a new head for the l_freelist by using the second entry
-          // in the list and incrementing the current tag.
-          uint64_t l_old_head_tag = get_head_tag( l_old_head );
-          uint64_t new_head_tag = increment_tag( l_old_head_tag );
-          uint64_t new_head_off =
-            *reinterpret_cast<volatile uint64_t *>( m_data + l_old_head_off );
-          uint64_t new_head = create_head( new_head_off, new_head_tag );
-
-          // Release the lock on the l freelist.
-          release_lock( l_freelist, new_head );
-
-          // Subdivide the chunk into smaller chunks.  The first chunk will
-          // be returned to satisfy the allocaiton request.  The remainder
-          // of the chunks will be inserted onto the appropriate freelist.
-          size_t num_chunks = m_chunk_size[l] / m_chunk_size[l_exp];
-
-          // Link the chunks following the first chunk to form a list.
-          uint64_t lp_head = l_old_head_off + m_chunk_size[l_exp];
-          uint64_t lp_tail = l_old_head_off + (num_chunks - 1) * m_chunk_size[l_exp];
-
-          for ( uint64_t offset = lp_head; offset < lp_tail;
-                offset += m_chunk_size[l_exp] )
-          {
-            *reinterpret_cast<uint64_t *>( m_data + offset ) =
-              offset + m_chunk_size[l_exp];
-          }
-
-          // Set the tail entry to be the end of the list.
-          *reinterpret_cast<volatile uint64_t *>( m_data + lp_tail ) = FREELIST_END;
-
-          memory_fence();
-
-          // Create a new head for the l_exp_freelist.
-          new_head = create_head( lp_head, get_head_tag( l_exp_old_head ) );
-
-          // This thread already has the lock on the l_exp freelist, so just
-          // release the lock placing the divided memory on the list.
-          release_lock( l_exp_freelist, new_head );
-
-          // Set the chunk to return.
-          p = m_data + l_old_head_off;
-          done = true;
-        }
-        else {
-          // Release the lock on the l freelist.  Put the old head back on.
-          release_lock( l_freelist, l_old_head );
-        }
-      }
-    }
-  }
-
-  return p;
-}
-
-KOKKOS_FUNCTION
-KOKKOS_MEMPOOLLIST_INLINE
-void *
-MemPoolList::allocate( size_t alloc_size ) const
-{
-  void * p = 0;
-
-  // Find the first freelist whose chunk size is big enough for allocation.
-  size_t l_exp = 0;
-  while ( m_chunk_size[l_exp] > 0 && alloc_size > m_chunk_size[l_exp] ) ++l_exp;
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  if ( m_chunk_size[l_exp] == 0 ) {
-    Kokkos::abort( "\n** MemoryPool::allocate() REQUESTED_SIZE_TOO_LARGE **\n" );
-  }
-#endif
-
-  // Do a fast fail test for an empty list.  This checks for l_exp and all
-  // higher freelists being empty.
-  size_t l = l_exp;
-  while ( m_chunk_size[l] > 0 &&
-          get_head_offset( m_freelist[l] ) == FREELIST_END ) ++l;
-
-  if ( m_chunk_size[l] != 0 ) {
-    // Try to grab a chunk from the l_exp list.
-    volatile uint64_t * l_exp_freelist = m_freelist + l_exp;
-
-    bool done = false;
-
-    while ( !done ) {
-      uint64_t old_head = *l_exp_freelist;
-      uint64_t old_head_off = get_head_offset( old_head );
-
-      if ( old_head_off == FREELIST_END ) {
-        // The list is empty.  Try to refill it and grab a chunk.
-        p = refill_freelist(l_exp);
-
-        done = true;
-      }
-      else if ( old_head_off != FREELIST_LOCK ) {
-        // The freelist wasn't empty or locked, so try to pop off the head.
-        uint64_t old_head_tag = get_head_tag( old_head );
-
-        // Increment the tag by 1, wrapping around to 0 after 2^32-1.
-        uint64_t new_head_tag = increment_tag( old_head_tag );
-        uint64_t new_head_off = *reinterpret_cast<uint64_t *>( m_data + old_head_off );
-        uint64_t new_head = create_head( new_head_off, new_head_tag );
-
-        // Attempt to pull off the head of the list and put the next entry in
-        // its place.  If the list was changed
-        // (including being locked) between the initial look and now, head will
-        // be different than old_head.  This means the insert can't proceed and
-        // has to be tried again.
-        uint64_t head = atomic_compare_exchange( l_exp_freelist, old_head, new_head );
-
-        if ( head == old_head ) {
-          done = true;
-          p = m_data + old_head_off;
-        }
-      }
-    }
-  }
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-  long val = p == 0 ?
-             *reinterpret_cast<volatile long *>( &m_count ) :
-             Kokkos::atomic_fetch_add( &m_count, 1 );
-
-  printf( "  allocate(): %6ld   size: %6lu    l: %2lu  %2lu   0x%llx\n", val,
-          alloc_size, l_exp, l, reinterpret_cast<uint64_t>( p ) );
-  fflush( stdout );
-#else
-  printf( "  allocate()   size: %6lu    l: %2lu  %2lu   0x%lx\n", alloc_size,
-          l_exp, l, reinterpret_cast<uint64_t>( p ) );
-#endif
-#endif
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  if ( p == 0 ) {
-    printf( "** MemoryPool::allocate() NO_CHUNKS_BIG_ENOUGH **\n" );
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    fflush( stdout );
-#endif
-  }
-#endif
-
-  return p;
-}
-
-KOKKOS_FUNCTION
-KOKKOS_MEMPOOLLIST_INLINE
-void
-MemPoolList::deallocate( void * alloc_ptr, size_t alloc_size ) const
-{
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  // Verify that the pointer is controlled by this pool.
-  {
-    char * ap = static_cast<char *>( alloc_ptr );
-
-    if ( ap < m_data || ap + alloc_size > m_data + m_data_size ) {
-      printf( "\n** MemoryPool::deallocate() ADDRESS_OUT_OF_RANGE(0x%llx) **\n",
-              reinterpret_cast<uint64_t>( alloc_ptr ) );
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-      fflush( stdout );
-#endif
-      Kokkos::abort( "" );
-    }
-  }
-#endif
-
-  // Determine which freelist to place deallocated memory on.
-  size_t l = 0;
-  while ( m_chunk_size[l] > 0 && alloc_size > m_chunk_size[l] ) ++l;
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-  if ( m_chunk_size[l] == 0 ) {
-    printf( "\n** MemoryPool::deallocate() CHUNK_TOO_LARGE(%lu) **\n", alloc_size );
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-    fflush( stdout );
-#endif
-    Kokkos::abort( "" );
-  }
-#endif
-
-  uint64_t offset = static_cast<char *>( alloc_ptr ) - m_data;
-
-  // Insert a single chunk at the head of the freelist.
-  volatile uint64_t * freelist = m_freelist + l;
-
-  bool inserted = false;
-
-  while ( !inserted ) {
-    uint64_t old_head = *freelist;
-
-    if ( old_head != FREELIST_LOCK_HEAD ) {
-      // In the initial look at the head, the freelist wasn't locked.
-
-      uint64_t old_head_off = get_head_offset(old_head);
-      uint64_t old_head_tag = get_head_tag(old_head);
-      uint64_t new_head = create_head( offset, old_head_tag );
-
-      // Proactively point the new head to the old head assuming a successful
-      // insertion into the list.
-      *reinterpret_cast<volatile uint64_t *>( alloc_ptr ) = old_head_off;
-
-      memory_fence();
-
-      // Attempt to insert at head of list.  If the list was changed
-      // (including being locked) between the initial look and now, head will
-      // be different than old_head.  This means the insert can't proceed and
-      // has to be tried again.
-      uint64_t head = atomic_compare_exchange( freelist, old_head, new_head );
-
-      if ( head == old_head ) inserted = true;
-    }
-  }
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO
-#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
-  long val = Kokkos::atomic_fetch_add( &m_count, -1 ) - 1;
-  printf( "deallocate(): %6ld   size: %6lu    l: %2lu       0x%llx\n", val,
-          alloc_size, l, reinterpret_cast<uint64_t>( alloc_ptr ) );
-  fflush( stdout );
-#else
-  printf( "deallocate()   size: %6lu    l: %2lu       0x%lx\n", alloc_size, l,
-          reinterpret_cast<uint64_t>( alloc_ptr ) );
-#endif
-#endif
-}
-
-
-} // namespace Impl
-} // namespace Experimental
-} // namespace Kokkos
-
-#undef KOKKOS_MEMPOOLLIST_INLINE
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINTERR
-#undef KOKKOS_MEMPOOLLIST_PRINTERR
-#endif
-
-#ifdef KOKKOS_MEMPOOLLIST_PRINT_INFO
-#undef KOKKOS_MEMPOOLLIST_PRINT_INFO
-#endif
-
-#endif /* #ifndef KOKKOS_MEMORYPOOL_CPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
index 0e87c63e44..556c96d863 100644
--- a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@@ -58,7 +58,7 @@ struct PhysicalLayout {
   long long int stride[8]; //distance between two neighboring elements in a given dimension
 
   template< class T , class L , class D , class M >
-  PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
+  PhysicalLayout( const View<T,L,D,M> & view )
     : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left : (
                    is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
     , rank( view.Rank )
@@ -66,17 +66,6 @@ struct PhysicalLayout {
       for(int i=0;i<8;i++) stride[i] = 0;
       view.stride( stride );
     }
-  #ifdef KOKKOS_HAVE_CUDA
-  template< class T , class L , class D , class M >
-  PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
-    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left : (
-                   is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
-    , rank( view.Rank )
-    {
-      for(int i=0;i<8;i++) stride[i] = 0;
-      view.stride( stride );
-    }
-  #endif
 };
 
 }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
index 5da60841d4..8ea1e816cd 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
@@ -45,7 +45,7 @@
 #define KOKKOSP_DEVICE_INFO_HPP
 
 namespace Kokkos {
-namespace Experimental {
+namespace Profiling {
 
     struct KokkosPDeviceInfo {
         uint32_t deviceID;
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
index f499cc63a7..91faed170a 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -47,7 +47,7 @@
 #include <string.h>
 
 namespace Kokkos {
-  namespace Experimental {
+  namespace Profiling {
     bool profileLibraryLoaded() {
        	return (NULL != initProfileLibrary);
     }
@@ -95,6 +95,12 @@ namespace Kokkos {
     }
     
     void initialize() {
+
+        // Make sure initialize calls happens only once
+        static int is_initialized = 0;
+        if(is_initialized) return;
+        is_initialized = 1;
+
         void* firstProfileLibrary;
 
         char* envProfileLibrary  = getenv("KOKKOS_PROFILE_LIBRARY");
@@ -153,6 +159,11 @@ namespace Kokkos {
     }
 
     void finalize() {
+      // Make sure finalize calls happens only once
+      static int is_finalized = 0;
+      if(is_finalized) return;
+      is_finalized = 1;
+
       if(NULL != finalizeProfileLibrary) {
         (*finalizeProfileLibrary)();
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 919c4f619e..4f01256335 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -60,7 +60,7 @@
 
 #if (KOKKOS_ENABLE_PROFILING)
 namespace Kokkos {
-  namespace Experimental {
+  namespace Profiling {
 
     typedef void (*initFunction)(const int,
 	const uint64_t,
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
new file mode 100644
index 0000000000..e8bdbde6c6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
@@ -0,0 +1,147 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Serial > ;
+
+void TaskQueueSpecialization< Kokkos::Serial >::execute
+  ( TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  Member exec ;
+
+  // Loop until all queues are empty
+  while ( 0 < queue->m_ready_count ) {
+
+    task_root_type * task = end ;
+
+    for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+      for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+        task = queue_type::pop_task( & queue->m_ready[i][j] );
+      }
+    }
+
+    if ( end != task ) {
+
+      // pop_task resulted in lock == task->m_next
+      // In the executing state
+
+      (*task->m_apply)( task , & exec );
+
+#if 0
+  printf( "TaskQueue<Serial>::executed: 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+      // If a respawn then re-enqueue otherwise the task is complete
+      // and all tasks waiting on this task are updated.
+      queue->complete( task );
+    }
+    else if ( 0 != queue->m_ready_count ) {
+      Kokkos::abort("TaskQueue<Serial>::execute ERROR: ready_count");
+    }
+  }
+}
+
+void TaskQueueSpecialization< Kokkos::Serial > ::
+  iff_single_thread_recursive_execute(
+    TaskQueue< Kokkos::Serial > * const queue )
+{
+  using execution_space = Kokkos::Serial ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  Member exec ;
+
+  // Loop until no runnable task
+
+  task_root_type * task = end ;
+  
+  do {
+
+    task = end ;
+
+    for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+      for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+        task = queue_type::pop_task( & queue->m_ready[i][j] );
+      }
+    }
+
+    if ( end == task ) break ;
+
+    (*task->m_apply)( task , & exec );
+
+    queue->complete( task );
+
+  } while(1);
+}
+
+}} /* namespace Kokkos::Impl */
+
+#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
new file mode 100644
index 0000000000..48a110c5f1
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -0,0 +1,271 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_SERIAL_TASK_HPP
+#define KOKKOS_IMPL_SERIAL_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskQueueSpecialization< Kokkos::Serial >
+{
+public:
+
+  using execution_space = Kokkos::Serial ;
+  using memory_space    = Kokkos::HostSpace ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  static
+  void execute( queue_type * const );
+
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< Kokkos::Serial
+                               , typename FunctorType::value_type
+                               , FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Serial > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Serial >
+{
+public:
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+};
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
+{
+  typedef iType index_type;
+  const iType start ;
+  const iType end ;
+  enum {increment = 1};
+  //const  TaskExec< Kokkos::Serial > & thread;
+  TaskExec< Kokkos::Serial > & thread;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct
+    //( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
+    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
+    : start(0)
+    , end(arg_count)
+    , thread(arg_thread)
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct
+    //( const TaskExec< Kokkos::Serial > & arg_thread
+    ( TaskExec< Kokkos::Serial > & arg_thread
+    , const iType& arg_start
+    , const iType & arg_end
+    )
+    : start( arg_start )
+    , end(   arg_end)
+    , thread( arg_thread )
+    {}
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+/*
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
+TeamThreadRange( const Impl::TaskExec< Kokkos::Serial > & thread
+               , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
+}
+*/
+//TODO const issue omp
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
+TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread
+               , const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
+}
+/*
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >
+TeamThreadRange( const Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end);
+}
+*/
+//TODO const issue omp
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >
+TeamThreadRange( Impl:: TaskExec< Kokkos::Serial > & thread, const iType & start , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >(thread,start,end);
+}
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i, result);
+
+  initialized_result = result;
+}
+
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+  ValueType result = initialized_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i, result);
+
+  initialized_result = result;
+}
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& initialized_result)
+{
+}
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& initialized_result)
+{
+}
+
+template< typename ValueType, typename iType, class Lambda >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda)
+{
+  ValueType accum = 0 ;
+  ValueType val, local_total;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    local_total = 0;
+    lambda(i,local_total,false);
+    val = accum;
+    lambda(i,val,true);
+    accum += local_total;
+  }
+
+}
+
+// placeholder for future function
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
+   const Lambda & lambda)
+{
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
index 5f3e65b327..1577df07cd 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
@@ -45,7 +45,8 @@
 
 #include <impl/Kokkos_Serial_TaskPolicy.hpp>
 
-#if defined( KOKKOS_HAVE_SERIAL )
+#if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
 #include <stdlib.h>
 #include <stdexcept>
 #include <iostream>
@@ -252,6 +253,12 @@ void Task::schedule()
 
   if ( ok_state && ok_list ) {
 
+    if ( TASK_STATE_CONSTRUCTING == m_state ) {
+      // Initial scheduling increment,
+      // matched by decrement when task is complete.
+      ++m_ref_count ;
+    }
+
     // Will be waiting for execution upon return from this function
 
     m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
@@ -286,7 +293,8 @@ void Task::execute_ready_tasks()
     // Task * task ;
     // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
 
-    Task * const task = s_ready ;
+    Task * task = s_ready ;
+
     s_ready = task->m_next ;
 
     task->m_next = 0 ;
@@ -325,6 +333,9 @@ void Task::execute_ready_tasks()
 
         x = next ;
       }
+
+      // Decrement to match the initial scheduling increment
+      assign( & task , 0 );
     }
   }
 }
@@ -333,4 +344,5 @@ void Task::execute_ready_tasks()
 } // namespace Experimental
 } // namespace Kokkos
 
-#endif // defined( KOKKOS_HAVE_SERIAL )
+#endif /* #if defined( KOKKOS_HAVE_SERIAL ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
index 3171449c16..a333f948ae 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
@@ -43,10 +43,11 @@
 
 // Experimental unified task-data parallel manycore LDRD
 
-#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
-#define KOKKOS_SERIAL_TASKPOLICY_HPP
+#ifndef KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP
+#define KOKKOS_EXPERIMENTAL_SERIAL_TASKPOLICY_HPP
 
 #include <Kokkos_Macros.hpp>
+
 #if defined( KOKKOS_HAVE_SERIAL )
 
 #include <string>
@@ -57,6 +58,8 @@
 #include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_View.hpp>
 
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
 //----------------------------------------------------------------------------
@@ -668,6 +671,7 @@ void wait( TaskPolicy< Kokkos::Serial > & )
 
 //----------------------------------------------------------------------------
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* defined( KOKKOS_HAVE_SERIAL ) */
-#endif /* #define KOKKOS_SERIAL_TASK_HPP */
+#endif /* #define KOKKOS_EXPERIMENTAL_SERIAL_TASK_HPP */
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
index 399b633be9..0bc2864ff1 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -46,93 +46,84 @@
 
 #include <impl/Kokkos_Traits.hpp>
 #include <Kokkos_Core_fwd.hpp>
+#include <type_traits>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class C , class Enable = void >
-struct is_memory_space_enable
-{ typedef std::false_type type ; };
-
-template< class C >
-struct is_memory_space_enable< C ,
-  typename std::enable_if<
-    std::is_same< C , typename C::memory_space >::value
-  >::type >
-{ typedef std::true_type type ; };
-
-
-template< class C , class Enable = void >
-struct is_execution_space_enable
-{ typedef std::false_type type ; };
-
-template< class C >
-struct is_execution_space_enable< C ,
-  typename std::enable_if<
-    std::is_same< C , typename C::execution_space >::value
-  >::type >
-{ typedef std::true_type type ; };
-
-
-template< class C , class Enable = void >
-struct is_execution_policy_enable
-{ typedef std::false_type type ; };
+/** KOKKOS_HAVE_TYPE( Type )
+ *
+ * defines a meta-function that check if a type expose an internal typedef or
+ * type alias which matches Type
+ *
+ * e.g.
+ *   KOKKOS_HAVE_TYPE( array_layout );
+ *   struct Foo { using array_layout = void; };
+ *   have_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_HAVE_TYPE( Type )                                                \
+template <typename T>                                                           \
+struct have_##Type {                                                            \
+  template <typename U> static std::false_type have_type(...);                  \
+  template <typename U> static std::true_type  have_type( typename U::Type* );  \
+  using type = decltype(have_type<T>(nullptr));                                 \
+  static constexpr bool value = type::value;                                    \
+}
 
-template< class C >
-struct is_execution_policy_enable< C ,
-  typename std::enable_if<
-    std::is_same< C , typename C::execution_policy >::value
-  >::type >
-{ typedef std::true_type type ; };
+/** KOKKOS_IS_CONCEPT( Concept )
+ *
+ * defines a meta-function that check if a type match the given Kokkos concept
+ * type alias which matches Type
+ *
+ * e.g.
+ *   KOKKOS_IS_CONCEPT( array_layout );
+ *   struct Foo { using array_layout = Foo; };
+ *   is_array_layout<Foo>::value == 1;
+ */
+#define KOKKOS_IS_CONCEPT( Concept )                                            \
+template <typename T>                                                           \
+struct is_##Concept {                                                           \
+  template <typename U> static std::false_type have_concept(...);               \
+  template <typename U> static auto have_concept( typename U::Concept* )        \
+                          ->typename std::is_same<T, typename U::Concept>::type;\
+  using type = decltype(have_concept<T>(nullptr));                              \
+  static constexpr bool value = type::value;                                    \
+}
 
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
-template< class C , class Enable = void >
-struct is_array_layout_enable
-{ typedef std::false_type type ; };
+namespace Kokkos { namespace Impl {
 
-template< class C >
-struct is_array_layout_enable< C ,
-  typename std::enable_if<
-    std::is_same< C , typename C::array_layout >::value
-  >::type >
-{ typedef std::true_type type ; };
+template <typename T>
+using is_void = std::is_same<void,T>;
 
+// is_memory_space<T>::value
+KOKKOS_IS_CONCEPT( memory_space );
 
-template< class C , class Enable = void >
-struct is_memory_traits_enable
-{ typedef std::false_type type ; };
+// is_memory_traits<T>::value
+KOKKOS_IS_CONCEPT( memory_traits );
 
-template< class C >
-struct is_memory_traits_enable< C ,
-  typename std::enable_if<
-    std::is_same< C , typename C::memory_traits >::value
-  >::type >
-{ typedef std::true_type type ; };
+// is_execution_space<T>::value
+KOKKOS_IS_CONCEPT( execution_space );
 
+// is_execution_policy<T>::value
+KOKKOS_IS_CONCEPT( execution_policy );
 
-template< class C >
-using is_memory_space = typename is_memory_space_enable<C>::type ;
+// is_array_layout<T>::value
+KOKKOS_IS_CONCEPT( array_layout );
 
-template< class C >
-using is_execution_space = typename is_execution_space_enable<C>::type ;
+// is_iteration_pattern<T>::value
+KOKKOS_IS_CONCEPT( iteration_pattern );
 
-template< class C >
-using is_execution_policy = typename is_execution_policy_enable<C>::type ;
+// is_schedule_type<T>::value
+KOKKOS_IS_CONCEPT( schedule_type );
 
-template< class C >
-using is_array_layout = typename is_array_layout_enable<C>::type ;
+// is_index_type<T>::value
+KOKKOS_IS_CONCEPT( index_type );
 
-template< class C >
-using is_memory_traits = typename is_memory_traits_enable<C>::type ;
+}} // namespace Kokkos::Impl
 
-}
-}
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
new file mode 100644
index 0000000000..663bb1985d
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -0,0 +1,499 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_IMPL_TASKQUEUE_HPP
+#define KOKKOS_IMPL_TASKQUEUE_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename > class TaskPolicy ;
+
+template< typename Arg1 = void , typename Arg2 = void > class Future ;
+
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename , typename , typename > class TaskBase ;
+template< typename > class TaskExec ;
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename Space >
+class TaskQueueSpecialization ;
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *
+ *  Task execution is deferred to the TaskQueueSpecialization.
+ *  All other aspects of task management have shared implementation.
+ */
+template< typename ExecSpace >
+class TaskQueue {
+private:
+
+  friend class TaskQueueSpecialization< ExecSpace > ;
+  friend class Kokkos::TaskPolicy< ExecSpace > ;
+
+  using execution_space = ExecSpace ;
+  using specialization  = TaskQueueSpecialization< execution_space > ;
+  using memory_space    = typename specialization::memory_space ;
+  using device_type     = Kokkos::Device< execution_space , memory_space > ;
+  using memory_pool     = Kokkos::Experimental::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase<execution_space,void,void> ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { NumQueue = 3 };
+
+  // Queue is organized as [ priority ][ type ]
+
+  memory_pool               m_memory ;
+  task_root_type * volatile m_ready[ NumQueue ][ 2 ];
+  long                      m_accum_alloc ; // Accumulated number of allocations
+  int                       m_count_alloc ; // Current number of allocations
+  int                       m_max_alloc ;   // Maximum number of allocations
+  int                       m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue
+    ( const memory_space & arg_space
+    , unsigned const arg_memory_pool_capacity
+    , unsigned const arg_memory_pool_superblock_capacity_log2
+    );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION
+  void schedule( task_root_type * const );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+  KOKKOS_FUNCTION
+  static bool push_task( task_root_type * volatile * const
+                       , task_root_type * const );
+
+  KOKKOS_FUNCTION
+  static task_root_type * pop_task( task_root_type * volatile * const );
+
+  KOKKOS_FUNCTION static
+  void decrement( task_root_type * task );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      specialization::iff_single_thread_recursive_execute( this );
+#endif
+    }
+
+  void execute() { specialization::execute( this ); }
+
+  // Assign task pointer with reference counting of assigned tasks
+  template< typename LV , typename RV >
+  KOKKOS_FUNCTION static
+  void assign( TaskBase< execution_space,LV,void> ** const lhs
+             , TaskBase< execution_space,RV,void> *  const rhs )
+    {
+      using task_lhs = TaskBase< execution_space,LV,void> ;
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( lhs ? *lhs : 0 )
+          , uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_task_type : 0 )
+          , int( lhs && *lhs ? (*lhs)->m_ref_count : 0 )
+          , uintptr_t(rhs)
+          , uintptr_t( rhs ? rhs->m_next : 0 )
+          , int( rhs ? rhs->m_task_type : 0 )
+          , int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+
+      if ( *lhs ) decrement( *lhs );
+      if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count) , 1 ); }
+
+      // Force write of *lhs
+
+      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Execution-space-independent constants shared by all TaskBase
+// specializations: the task-type discriminants and the sentinel pointer
+// values (LockTag / EndTag) used to tag queue links.
+template<>
+class TaskBase< void , void , void > {
+public:
+  enum : int16_t   { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
+  enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
+};
+
+/** \brief  Base class for task management, access, and execution.
+ *
+ *  Inheritance structure to allow static_cast from the task root type
+ *  and a task's FunctorType.
+ *
+ *    // Enable a Future to access result data
+ *    TaskBase< Space , ResultType , void >
+ *      : TaskBase< void , void , void >
+ *      { ... };
+ *
+ *    // Enable a functor to access the base class
+ *    TaskBase< Space , ResultType , FunctorType >
+ *      : TaskBase< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *
+ *  States of a task:
+ *
+ *    Constructing State, NOT IN a linked list
+ *      m_wait == 0
+ *      m_next == 0
+ *
+ *    Scheduling transition : Constructing -> Waiting
+ *      before:
+ *        m_wait == 0
+ *        m_next == this task's initial dependence, 0 if none
+ *      after:
+ *        m_wait == EndTag
+ *        m_next == EndTag
+ *
+ *    Waiting State, IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == next of linked list of tasks
+ *
+ *    transition : Waiting -> Executing
+ *      before:
+ *        m_next == EndTag
+ *      after:
+ *        m_next == LockTag
+ *
+ *    Executing State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == LockTag
+ *
+ *    Respawn transition : Executing -> Executing-Respawn
+ *      before:
+ *        m_next == LockTag
+ *      after:
+ *        m_next == this task's updated dependence, 0 if none
+ *
+ *    Executing-Respawn State, NOT IN a linked list
+ *      m_apply != 0
+ *      m_queue != 0
+ *      m_ref_count > 0
+ *      m_wait == head of linked list of tasks waiting on this task
+ *      m_next == this task's updated dependence, 0 if none
+ *
+ *    transition : Executing -> Complete
+ *      before:
+ *        m_wait == head of linked list
+ *      after:
+ *        m_wait == LockTag
+ *
+ *    Complete State, NOT IN a linked list
+ *      m_wait == LockTag: cannot add dependence
+ *      m_next == LockTag: not a member of a wait queue
+ *
+ */
+// Per-execution-space task root type.  Holds the scheduling state
+// (wait/next links, reference count, task type, priority) and the
+// type-erased apply-function pointer.  The task state machine that
+// governs m_wait / m_next transitions is documented in the comment
+// block above.
+template< typename ExecSpace >
+class TaskBase< ExecSpace , void , void >
+{
+public:
+
+  enum : int16_t   { TaskTeam   = TaskBase<void,void,void>::TaskTeam
+                   , TaskSingle = TaskBase<void,void,void>::TaskSingle
+                   , Aggregate  = TaskBase<void,void,void>::Aggregate };
+
+  enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag
+                   , EndTag  = TaskBase<void,void,void>::EndTag };
+
+  using execution_space = ExecSpace ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  template< typename > friend class Kokkos::TaskPolicy ;
+
+  typedef void (* function_type) ( TaskBase * , void * );
+
+  // sizeof(TaskBase) == 48
+  // (verified by a static_assert in the derived result-holding type;
+  // do not reorder or resize these members without updating it)
+
+  function_type  m_apply ;     ///< Apply function pointer
+  queue_type   * m_queue ;     ///< Queue in which this task resides
+  TaskBase     * m_wait ;      ///< Linked list of tasks waiting on this
+  TaskBase     * m_next ;      ///< Waiting linked-list next
+  int32_t        m_ref_count ; ///< Reference count
+  int32_t        m_alloc_size ;///< Allocation size
+  int32_t        m_dep_count ; ///< Aggregate's number of dependences
+  int16_t        m_task_type ; ///< Type of task
+  int16_t        m_priority ;  ///< Priority of runnable task
+
+  // Tasks are pinned in memory (linked into queues by address):
+  // non-copyable and non-movable.
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  // Construct in the "Constructing" state: all links and counts zero.
+  KOKKOS_INLINE_FUNCTION
+  constexpr TaskBase() noexcept
+    : m_apply(0)
+    , m_queue(0)
+    , m_wait(0)
+    , m_next(0)
+    , m_ref_count(0)
+    , m_alloc_size(0)
+    , m_dep_count(0)
+    , m_task_type( TaskSingle )
+    , m_priority( 1 /* TaskRegularPriority */ )
+    {}
+
+  //----------------------------------------
+
+  // Aggregate (when_all) tasks store their m_dep_count dependence
+  // pointers in the memory immediately following this object.
+  KOKKOS_INLINE_FUNCTION
+  TaskBase ** aggregate_dependences()
+    { return reinterpret_cast<TaskBase**>( this + 1 ); }
+
+  // A void task has no result to return.
+  using get_return_type = void ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const {}
+};
+
+// Task root type extended with result storage.  A Future references this
+// layer to read the task's result via get().
+template < typename ExecSpace , typename ResultType >
+class TaskBase< ExecSpace , ResultType , void >
+  : public TaskBase< ExecSpace , void , void >
+{
+private:
+
+  // The scheduler's layout assumptions depend on the root type's size.
+  static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
+
+  // Tasks are pinned in memory: non-copyable and non-movable.
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+public:
+
+  ResultType   m_result ;  ///< Result written by the task's functor
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskBase()
+    : TaskBase< ExecSpace , void , void >()
+    , m_result()
+    {}
+
+  using get_return_type = ResultType const & ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const { return m_result ; }
+};
+
+
+// Concrete task: result-holding task root combined (by inheritance) with
+// the user's functor.  'apply' is the type-erased entry point stored in
+// the root's m_apply; it invokes the functor and, if the task did not
+// respawn, destroys the functor to release its resources.
+template< typename ExecSpace , typename ResultType , typename FunctorType >
+class TaskBase
+  : public TaskBase< ExecSpace , ResultType , void >
+  , public FunctorType
+{
+private:
+
+  // Tasks are constructed in place from a functor and are pinned in
+  // memory: no default construction, no copy, no move.
+  TaskBase() = delete ;
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+public:
+
+  using root_type    = TaskBase< ExecSpace , void , void > ;
+  using base_type    = TaskBase< ExecSpace , ResultType , void > ;
+  using member_type  = TaskExec< ExecSpace > ;
+  using functor_type = FunctorType ;
+  using result_type  = ResultType ;
+
+  // Overload selected when the task has no result:
+  // invoke functor(member) only.
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_functor
+    ( Type * const task
+    , typename std::enable_if
+        < std::is_same< typename Type::result_type , void >::value
+        , member_type * const 
+        >::type member
+    )
+    {
+      using fType = typename Type::functor_type ;
+      static_cast<fType*>(task)->operator()( *member );
+    }
+
+  // Overload selected when the task has a result:
+  // invoke functor(member, result) so the functor writes m_result.
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_functor
+    ( Type * const task
+    , typename std::enable_if
+        < ! std::is_same< typename Type::result_type , void >::value
+        , member_type * const 
+        >::type member
+    )
+    {
+      using fType = typename Type::functor_type ;
+      static_cast<fType*>(task)->operator()( *member , task->m_result );
+    }
+
+  // Type-erased execution entry point (installed in root_type::m_apply).
+  // After execution, m_next == LockTag indicates the task did NOT
+  // respawn, in which case the functor is destroyed here.
+  KOKKOS_FUNCTION static
+  void apply( root_type * root , void * exec )
+    {
+      TaskBase    * const lock   = reinterpret_cast< TaskBase * >( root_type::LockTag );
+      TaskBase    * const task   = static_cast< TaskBase * >( root );
+      member_type * const member = reinterpret_cast< member_type * >( exec );
+
+      TaskBase::template apply_functor( task , member );
+
+      // Task may be serial or team.
+      // If team then must synchronize before querying task->m_next.
+      // If team then only one thread calls destructor.
+
+      member->team_barrier();
+
+      if ( 0 == member->team_rank() && lock == task->m_next ) {
+        // Did not respawn, destroy the functor to free memory
+        static_cast<functor_type*>(task)->~functor_type();
+        // Cannot destroy the task until its dependences
+        // have been processed.
+      }
+    }
+
+  // Construct from a copy of the user's functor.
+  KOKKOS_INLINE_FUNCTION
+  TaskBase( FunctorType const & arg_functor )
+    : base_type()
+    , FunctorType( arg_functor )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~TaskBase() {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_TASKQUEUE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
new file mode 100644
index 0000000000..70a880d4a2
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -0,0 +1,569 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+// SharedAllocationRecord destroy hook: run the queue's destructor in
++// place when the owning allocation record is released.
+template< typename ExecSpace >
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{
+  m_queue->~TaskQueue();
+}
+
+//----------------------------------------------------------------------------
+
+// Construct the task queue: set up the memory pool and mark every
+// ready queue (one per priority x task-type pair) empty via EndTag.
+// NOTE(review): m_count_alloc does not appear in this initializer list
+// even though allocate()/deallocate() modify it -- confirm it is
+// initialized at its declaration, otherwise its initial value is
+// indeterminate.
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::TaskQueue
+  ( const TaskQueue< ExecSpace >::memory_space & arg_space
+  , unsigned const arg_memory_pool_capacity
+  , unsigned const arg_memory_pool_superblock_capacity_log2
+  )
+  : m_memory( arg_space
+            , arg_memory_pool_capacity
+            , arg_memory_pool_superblock_capacity_log2 )
+  , m_ready()
+  , m_accum_alloc(0)
+  , m_max_alloc(0)
+  , m_ready_count(0)
+{
+  // EndTag marks an empty (but unlocked) ready queue.
+  for ( int i = 0 ; i < NumQueue ; ++i ) {
+    m_ready[i][0] = (task_root_type *) task_root_type::EndTag ;
+    m_ready[i][1] = (task_root_type *) task_root_type::EndTag ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Destroy the queue.  Aborts if any ready queue still holds tasks or if
+// the ready+executing count is nonzero: destroying a live queue would
+// orphan tasks.
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // Verify that queues are empty and ready count is zero
+
+  for ( int i = 0 ; i < NumQueue ; ++i ) {
+    for ( int j = 0 ; j < 2 ; ++j ) {
+      if ( m_ready[i][j] != (task_root_type *) task_root_type::EndTag ) {
+        Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready tasks");
+      }
+    }
+  }
+
+  if ( 0 != m_ready_count ) {
+    Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+  }
+}
+
+//----------------------------------------------------------------------------
+
+// Release one reference on 'task'.  If that was the last reference and
+// the task is complete (m_next == LockTag) the task is deallocated.
+// Dropping the last reference on an incomplete task, or decrementing a
+// count that is already non-positive, is a fatal error.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::decrement
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // 'count' is the reference count BEFORE the decrement.
+  const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1);
+
+#if 0
+  if ( 1 == count ) {
+    printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
+          , uintptr_t( task )
+          , uintptr_t( task->m_next )
+          , int( task->m_task_type )
+          , int( task->m_ref_count )
+          );
+  }
+#endif
+
+  if ( ( 1 == count ) && 
+       ( task->m_next == (task_root_type *) task_root_type::LockTag ) ) {
+    // Reference count is zero and task is complete, deallocate.
+    task->m_queue->deallocate( task , task->m_alloc_size );
+  }   
+  else if ( count <= 1 ) { 
+    Kokkos::abort("TaskPolicy task has negative reference count or is incomplete" );
+  }   
+}
+
+//----------------------------------------------------------------------------
+
+// Query the actual block size the memory pool would allocate for a
+// request of 'n' bytes (pool blocks are quantized).
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{
+  return m_memory.allocate_block_size( n );
+}
+
+// Allocate 'n' bytes from the memory pool, updating allocation
+// statistics on success.  Returns null if the pool is exhausted.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  void * const p = m_memory.allocate(n);
+
+  if ( p ) {
+    Kokkos::atomic_increment( & m_accum_alloc );
+    Kokkos::atomic_increment( & m_count_alloc );
+
+    // NOTE(review): non-atomic read-modify-write of m_max_alloc; this
+    // appears to be an advisory high-water statistic that tolerates a
+    // benign race under concurrent allocation -- confirm.
+    if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
+  }
+
+  return p ;
+}
+
+// Return 'n' bytes at 'p' to the memory pool and decrement the count of
+// live allocations.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p , size_t n )
+{
+  m_memory.deallocate( p , n );
+  Kokkos::atomic_decrement( & m_count_alloc );
+}
+
+//----------------------------------------------------------------------------
+
+// Push 'task' onto the head of the given lock-free queue.
+// Returns true on success; returns false only if the queue is observed
+// locked (LockTag), in which case task->m_next is restored to zero.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+bool TaskQueue< ExecSpace >::push_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue
+  , TaskQueue< ExecSpace >::task_root_type * const task
+  )
+{
+  // Push task into a concurrently pushed and popped queue.
+  // The queue is a linked list where 'task->m_next' form the links.
+  // Fail the push attempt if the queue is locked;
+  // otherwise retry until the push succeeds.
+
+#if 0
+  printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+        , uintptr_t(queue)
+        , uintptr_t(*queue)
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+
+  task_root_type * volatile * const next = & task->m_next ;
+
+  // Precondition: a task may be a member of at most one queue at a time,
+  // so its link must currently be null.
+  if ( zero != *next ) {
+    Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
+  }
+
+  task_root_type * y = *queue ;
+
+  // Standard CAS loop: link task to the observed head, then attempt to
+  // swing the head to 'task'.  On CAS failure 'y' is the fresh head.
+  while ( lock != y ) {
+
+    *next = y ;
+
+    // Do not proceed until '*next' has been stored.
+    Kokkos::memory_fence();
+
+    task_root_type * const x = y ;
+
+    y = Kokkos::atomic_compare_exchange(queue,y,task);
+
+    if ( x == y ) return true ;
+  }
+
+  // Failed, replace 'task->m_next' value since 'task' remains
+  // not a member of a queue.
+
+  *next = zero ;
+
+  // Do not proceed until '*next' has been stored.
+  Kokkos::memory_fence();
+
+  return false ;
+}
+
+//----------------------------------------------------------------------------
+
+// Pop the head task from the given lock-free queue.
+// Returns EndTag if the queue is empty.  On success the popped task's
+// m_next is set to LockTag (popped, not in any queue) and the queue
+// head is advanced, which also unlocks the queue.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+typename TaskQueue< ExecSpace >::task_root_type *
+TaskQueue< ExecSpace >::pop_task
+  ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
+{
+  // Pop task from a concurrently pushed and popped queue.
+  // The queue is a linked list where 'task->m_next' form the links.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  // *queue is
+  //   end   => an empty queue
+  //   lock  => a locked queue
+  //   valid
+
+  // Retry until the lock is acquired or the queue is empty.
+
+  task_root_type * task = *queue ;
+
+  while ( end != task ) {
+
+    // The only possible values for the queue are
+    // (1) lock, (2) end, or (3) a valid task.
+    // Thus zero will never appear in the queue.
+    //
+    // If queue is locked then just read by guaranteeing
+    // the CAS will fail.
+
+    if ( lock == task ) task = 0 ;
+
+    task_root_type * const x = task ;
+
+    task = Kokkos::atomic_compare_exchange(queue,task,lock);
+
+    if ( x == task ) break ; // CAS succeeded and queue is locked
+  }
+
+  if ( end != task ) {
+
+    // This thread has locked the queue and removed 'task' from the queue.
+    // Extract the next entry of the queue from 'task->m_next'
+    // and mark 'task' as popped from a queue by setting
+    // 'task->m_next = lock'.
+
+    task_root_type * const next =
+      Kokkos::atomic_exchange( & task->m_next , lock );
+
+    // Place the next entry in the head of the queue,
+    // which also unlocks the queue.
+
+    task_root_type * const unlock =
+      Kokkos::atomic_exchange( queue , next );
+
+    // Sanity checks: the popped task's link must have been a valid entry
+    // or EndTag, and the head must still have held our lock.
+    if ( next == zero || next == lock || lock != unlock ) {
+      Kokkos::abort("TaskQueue::pop_task ERROR");
+    }
+  }
+
+#if 0
+  if ( end != task ) {
+    printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+          , uintptr_t(queue)
+          , uintptr_t(task)
+          , uintptr_t(task->m_wait)
+          , uintptr_t(task->m_next)
+          , int(task->m_task_type)
+          , int(task->m_priority)
+          , int(task->m_ref_count) );
+  }
+#endif
+
+  return task ;
+}
+
+//----------------------------------------------------------------------------
+
+// Schedule 'task': transition it from Constructing or Executing-Respawn
+// to Waiting, then either push it onto a dependence's wait queue or,
+// if it is ready, onto the appropriate ready queue.  Aggregate
+// (when_all) tasks with all dependences complete are completed here.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule a runnable or when_all task upon construction / spawn
+  // and upon completion of other tasks that 'task' is waiting on.
+
+  // Precondition on runnable task state:
+  //   task is either constructing or executing
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence
+  //   Executing-respawn state:
+  //     task->m_wait == head of linked list
+  //     task->m_next == dependence
+  //
+  //  Task state transition:
+  //     Constructing      ->  Waiting
+  //     Executing-respawn ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list
+  //     task->m_next == member of linked list
+
+#if 0
+  printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+  //----------------------------------------
+  {
+    // If Constructing then task->m_wait == 0
+    // Change to waiting by task->m_wait = EndTag
+
+    task_root_type * const init =
+      Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
+
+    // Precondition: a completed task (m_wait == LockTag) must never be
+    // scheduled again.
+
+    if ( lock == init ) {
+      Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
+    }
+
+    // if ( init == 0 ) Constructing       ->  Waiting
+    // else             Executing-Respawn  ->  Waiting
+  }
+  //----------------------------------------
+
+  if ( task_root_type::Aggregate != task->m_task_type ) {
+
+    // Scheduling a runnable task which may have a dependency 'dep'.
+    // Extract dependence, if any, from task->m_next.
+    // If 'dep' is not null then attempt to push 'task'
+    // into the wait queue of 'dep'.
+    // If the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
+    // If the push fails then 'dep' is complete and 'task'
+    // is ready to execute.
+
+    task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+
+    const bool is_ready = 
+      ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+
+    // Reference count for dep was incremented when assigned
+    // to task->m_next so that if it completed prior to the
+    // above push_task dep would not be destroyed.
+    // dep reference count can now be decremented,
+    // which may deallocate the task.
+    TaskQueue::assign( & dep , (task_root_type *)0 );
+
+    if ( is_ready ) {
+
+      // No dependence or 'dep' is complete so push task into ready queue.
+      // Increment the ready count before pushing into ready queue
+      // to track number of ready + executing tasks.
+      // The ready count will be decremented when the task is complete.
+
+      Kokkos::atomic_increment( & m_ready_count );
+
+      // Select the ready queue by (priority, task type).
+      task_root_type * volatile * const queue =
+        & m_ready[ task->m_priority ][ task->m_task_type ];
+
+      // A push_task fails if the ready queue is locked.
+      // A ready queue is only locked during a push or pop;
+      // i.e., it is never permanently locked.
+      // Retry push to ready queue until it succeeds.
+      // When the push succeeds then 'task' may be
+      // processed or executed by another thread at any time.
+
+      while ( ! push_task( queue , task ) );
+    }
+  }
+  //----------------------------------------
+  else {
+    // Scheduling a 'when_all' task with multiple dependences.
+    // This scheduling may be called when the 'when_all' is
+    // (1) created or
+    // (2) being removed from a completed task's wait list.
+
+    task_root_type ** const aggr = task->aggregate_dependences();
+
+    // Assume the 'when_all' is complete until a dependence is
+    // found that is not complete.
+
+    bool is_complete = true ;
+
+    for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+
+      --i ;
+
+      // Loop dependences looking for an incomplete task.
+      // Add this task to the incomplete task's wait queue.
+
+      // Remove a task 'x' from the dependence list.
+      // The reference count of 'x' was incremented when
+      // it was assigned into the dependence list.
+
+      task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+
+      if ( x ) {
+
+        // If x->m_wait is not locked then push succeeds
+        // and the aggregate is not complete.
+        // If the push succeeds then this when_all 'task' may be
+        // processed by another thread at any time.
+        // For example, 'x' may be completed by another
+        // thread and then re-schedule this when_all 'task'.
+
+        is_complete = ! push_task( & x->m_wait , task );
+
+        // Decrement reference count which had been incremented
+        // when 'x' was added to the dependence list.
+
+        TaskQueue::assign( & x , zero );
+      }
+    }
+
+    if ( is_complete ) {
+      // The when_all 'task' was not added to a wait queue because
+      // all dependences were complete so this aggregate is complete.
+      // Complete the when_all 'task' to schedule other tasks
+      // that are waiting for the when_all 'task' to complete.
+
+      task->m_next = lock ;
+
+      complete( task );
+
+      // '*task' may have been deleted upon completion
+    }
+  }
+  //----------------------------------------
+  // Postcondition:
+  //   A runnable 'task' was pushed into a wait or ready queue.
+  //   An aggregate 'task' was either pushed to a wait queue
+  //   or completed.
+  // Concurrent execution may have already popped 'task'
+  // from a queue and processed it as appropriate.
+}
+
+//----------------------------------------------------------------------------
+
+// Complete 'task' after it executed (runnable) or after all its
+// dependences finished (aggregate).  A respawned runnable task is
+// rescheduled instead.  Otherwise the wait queue is locked (marking the
+// task complete), the creation reference is released, and every task
+// that was waiting on 'task' is rescheduled.
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and requested respawn.
+    // Schedule the task for subsequent execution.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // Is either an aggregate or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task , zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+
+        // Set x->m_next = zero  <=  no dependence
+
+        task_root_type * const next =
+          (task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // A runnable task was popped from a ready queue and executed.
+    // If respawned into a ready queue then the ready count was incremented
+    // so decrement whether respawned or not.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
index 80a326f080..1f14e42874 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -109,6 +109,9 @@ public:
 };
 
 } // namespace Impl
+
+  using Kokkos::Impl::Timer ;
+
 } // namespace Kokkos
 
 #endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
index b9e07a82de..278f715bc9 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -47,6 +47,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <Kokkos_Macros.hpp>
+#include <string>
 #include <type_traits>
 
 namespace Kokkos {
@@ -357,9 +358,31 @@ struct is_integral : public integral_constant< bool ,
     std::is_same< T , uint64_t >::value 
   )>
 {};
-
 //----------------------------------------------------------------------------
 
+// is_label<T>: compile-time predicate identifying types usable as an
+// object label -- C-string pointers, string literals (char arrays), and
+// std::string.  All other types map to false_type.
+template<typename T>
+struct is_label : public false_type {};
+
+template<>
+struct is_label<const char*> : public true_type {};
+
+template<>
+struct is_label<char*> : public true_type {};
+
+
+// String literals deduce as char arrays of fixed extent.
+template<int N>
+struct is_label<const char[N]> : public true_type {};
+
+template<int N>
+struct is_label<char[N]> : public true_type {};
+
+
+template<>
+struct is_label<const std::string> : public true_type {};
+
+template<>
+struct is_label<std::string> : public true_type {};
+
 // These 'constexpr'functions can be used as
 // both regular functions and meta-function.
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
index 452af66cde..8b63039f57 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
@@ -115,135 +115,6 @@ template< class ExecSpace , class Type , bool Initialize >
 struct ViewDefaultConstruct
 { ViewDefaultConstruct( Type * , size_t ) {} };
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
- *          uses to access data with the [] operator. It also provides
- *          an allocate function and a function to extract a raw ptr from the
- *          data handle. ViewDataHandle also defines an enum ReferenceAble which
- *          specifies whether references/pointers to elements can be taken and a
- *          'return_type' which is what the view operators will give back.
- *          Specialisation of this object allows three things depending
- *          on ViewTraits and compiler options:
- *          (i)   Use special allocator (e.g. huge pages/small pages and pinned memory)
- *          (ii)  Use special data handle type (e.g. add Cuda Texture Object)
- *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
- */
-template< class StaticViewTraits , class Enable = void >
-struct ViewDataHandle {
-
-  enum { ReturnTypeIsReference = true };
-
-  typedef typename StaticViewTraits::value_type * handle_type;
-  typedef typename StaticViewTraits::value_type & return_type;
-
-  KOKKOS_INLINE_FUNCTION
-  static handle_type create_handle( typename StaticViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
-  {
-    return handle_type(arg_data_ptr);
-  }
-};
-
-template< class StaticViewTraits , class Enable = void >
-class ViewDataManagement : public ViewDataHandle< StaticViewTraits > {
-private:
-
-  template< class , class > friend class ViewDataManagement ;
-
-  struct PotentiallyManaged  {};
-  struct StaticallyUnmanaged {};
-
-  /* Statically unmanaged if traits or not executing in host-accessible memory space */
-  typedef typename
-    Impl::if_c< StaticViewTraits::is_managed &&
-                Impl::is_same< Kokkos::HostSpace
-                             , Kokkos::Impl::ActiveExecutionMemorySpace >::value
-              , PotentiallyManaged
-              , StaticallyUnmanaged
-              >::type StaticManagementTag ;
-
-  enum { Unmanaged     = 0x01
-       , Noncontiguous = 0x02
-       };
-
-  enum { DefaultTraits = Impl::is_same< StaticManagementTag , StaticallyUnmanaged >::value ? Unmanaged : 0 };
-
-  unsigned m_traits ; ///< Runtime traits
-
-
-  template< class T >
-  inline static
-  unsigned assign( const ViewDataManagement<T> & rhs , const PotentiallyManaged & )
-    { return rhs.m_traits | ( rhs.is_managed() && Kokkos::HostSpace::in_parallel() ? unsigned(Unmanaged) : 0u ); }
-
-  template< class T >
-  KOKKOS_INLINE_FUNCTION static
-  unsigned assign( const ViewDataManagement<T> & rhs , const StaticallyUnmanaged & )
-    { return rhs.m_traits | Unmanaged ; }
-
-public:
-
-  typedef typename ViewDataHandle< StaticViewTraits >::handle_type handle_type;
-
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement() : m_traits( DefaultTraits ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement( const ViewDataManagement & rhs )
-    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
-
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement & operator = ( const ViewDataManagement & rhs )
-    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
-
-  template< class SVT >
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement( const ViewDataManagement<SVT> & rhs )
-    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
-
-  template< class SVT >
-  KOKKOS_INLINE_FUNCTION
-  ViewDataManagement & operator = ( const ViewDataManagement<SVT> & rhs )
-    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
-
-  KOKKOS_INLINE_FUNCTION
-  bool is_managed() const { return ! ( m_traits & Unmanaged ); }
-
-  KOKKOS_INLINE_FUNCTION
-  bool is_contiguous() const { return ! ( m_traits & Noncontiguous ); }
-
-  KOKKOS_INLINE_FUNCTION
-  void set_unmanaged() { m_traits |= Unmanaged ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void set_noncontiguous() { m_traits |= Noncontiguous ; }
-
-  template< bool Initialize >
-  static
-  handle_type allocate(  const std::string & label
-                       , const Impl::ViewOffset< typename StaticViewTraits::shape_type, typename StaticViewTraits::array_layout > & offset_map
-                       , AllocationTracker & tracker
-               )
-    {
-      typedef typename StaticViewTraits::execution_space  execution_space ;
-      typedef typename StaticViewTraits::memory_space     memory_space ;
-      typedef typename StaticViewTraits::value_type       value_type ;
-
-      const size_t count = offset_map.capacity();
-
-      tracker = memory_space::allocate_and_track( label, sizeof(value_type) * count );
-
-      value_type * ptr = reinterpret_cast<value_type *>(tracker.alloc_ptr());
-
-      // Default construct within the view's execution space.
-      (void) ViewDefaultConstruct< execution_space , value_type , Initialize >( ptr , count );
-
-      return ViewDataHandle< StaticViewTraits >::create_handle(ptr, tracker);
-    }
-};
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 } // namespace Impl
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
index 0fc3e22b9e..61d2e35702 100644
--- a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
@@ -46,164 +46,11 @@
 
 #include <impl/KokkosExp_ViewTile.hpp>
 
-#if KOKKOS_USING_EXP_VIEW
-
 namespace Kokkos {
 
 using Kokkos::Experimental::tile_subview ;
 
 }
 
-#else
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-template< class T , unsigned N0 , unsigned N1 , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< T , void , LayoutTileLeft<N0,N1> , MemorySpace , MemoryTraits >
-{
-  typedef ViewDefault type ;
-};
-
-struct ViewTile {};
-
-template< class ShapeType , unsigned N0 , unsigned N1 >
-struct ViewOffset< ShapeType
-                 , LayoutTileLeft<N0,N1,true> /* Only accept properly shaped tiles */
-                 , typename Impl::enable_if<( 2 == ShapeType::rank
-                                              &&
-                                              2 == ShapeType::rank_dynamic
-                                            )>::type >
-  : public ShapeType
-{
-  enum { SHIFT_0 = Impl::integral_power_of_two(N0) };
-  enum { SHIFT_1 = Impl::integral_power_of_two(N1) };
-  enum { MASK_0  = N0 - 1 };
-  enum { MASK_1  = N1 - 1 };
-
-  typedef size_t                      size_type ;
-  typedef ShapeType                   shape_type ;
-  typedef LayoutTileLeft<N0,N1,true>  array_layout ;
-
-  enum { has_padding = true };
-
-  size_type tile_N0 ;
-
-  KOKKOS_INLINE_FUNCTION
-  void assign( const ViewOffset & rhs )
-    {
-      shape_type::N0 = rhs.N0 ;
-      shape_type::N1 = rhs.N1 ;
-      tile_N0 = ( rhs.N0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
-    }
-
-  KOKKOS_INLINE_FUNCTION
-  void assign( size_t n0 , size_t n1
-             , int = 0 , int = 0
-             , int = 0 , int = 0
-             , int = 0 , int = 0
-             , int = 0
-             )
-    {
-      shape_type::N0 = n0 ;
-      shape_type::N1 = n1 ;
-      tile_N0 = ( n0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
-    }
-
-
-  KOKKOS_INLINE_FUNCTION
-  void set_padding() {}
-
-
-  template< typename I0 , typename I1 >
-  KOKKOS_INLINE_FUNCTION
-  size_type operator()( I0 const & i0 , I1 const & i1
-                      , int = 0 , int = 0
-                      , int = 0 , int = 0
-                      , int = 0 , int = 0
-                      ) const
-    {
-      return /* ( ( Tile offset                             ) *  ( Tile size       ) ) */
-                ( ( (i0>>SHIFT_0) + tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) +
-             /* ( Offset within tile                       ) */
-                ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ;
-    }
-
-  template< typename I0 , typename I1 >
-  KOKKOS_INLINE_FUNCTION
-  size_type tile_begin( I0 const & i_tile0 , I1 const & i_tile1 ) const
-    {
-      return ( i_tile0 + tile_N0 * i_tile1 ) << ( SHIFT_0 + SHIFT_1 );
-    }
-
-
-  KOKKOS_INLINE_FUNCTION
-  size_type capacity() const
-    {
-      // ( TileDim0 * ( TileDim1 ) ) * TileSize
-      return ( tile_N0 * ( ( shape_type::N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 );
-    }
-};
-
-template<>
-struct ViewAssignment< ViewTile , void , void >
-{
-  // Some compilers have type-matching issues on the integer values when using:
-  //   template< class T , unsigned N0 , unsigned N1 , class A2 , class A3 >
-  template< class T , unsigned dN0 , unsigned dN1
-          , class A2 , class A3
-          , unsigned sN0 , unsigned sN1 >
-  KOKKOS_INLINE_FUNCTION
-  ViewAssignment( View< T[dN0][dN1], LayoutLeft, A2, A3, Impl::ViewDefault > & dst
-                , View< T** , LayoutTileLeft<sN0,sN1,true>, A2, A3, Impl::ViewDefault > const & src
-                , size_t const i_tile0
-                , typename Impl::enable_if< unsigned(dN0) == unsigned(sN0) &&
-                                            unsigned(dN1) == unsigned(sN1)
-                                          , size_t const
-                                          >::type i_tile1
-                )
-   {
-     // Destination is always contiguous but source may be non-contiguous
-     // so don't assign the whole view management object.
-     // Just query and appropriately set the reference-count state.
-
-     if ( ! src.m_management.is_managed() ) dst.m_management.set_unmanaged();
-
-     dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map.tile_begin(i_tile0,i_tile1);
-
-     dst.m_tracker = src.m_tracker;
-   }
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-
-template< class T , unsigned N0, unsigned N1, class A2, class A3 >
-KOKKOS_INLINE_FUNCTION
-View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault >
-tile_subview( const View<T**,LayoutTileLeft<N0,N1,true>,A2,A3,Impl::ViewDefault> & src
-            , const size_t i_tile0
-            , const size_t i_tile1
-            )
-{
-  View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > dst ;
-
-  (void) Impl::ViewAssignment< Impl::ViewTile , void , void >( dst , src , i_tile0 , i_tile1 );
-
-  return dst ;
-}
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif
-
 #endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */
 
diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt
index e835245e25..5bb2b672e1 100644
--- a/lib/kokkos/core/unit_test/CMakeLists.txt
+++ b/lib/kokkos/core/unit_test/CMakeLists.txt
@@ -44,7 +44,7 @@ ENDIF()
 IF(Kokkos_ENABLE_OpenMP)
   TRIBITS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_OpenMP
-    SOURCES UnitTestMain.cpp TestOpenMP.cpp
+    SOURCES UnitTestMain.cpp TestOpenMP.cpp TestOpenMP_a.cpp TestOpenMP_b.cpp TestOpenMP_c.cpp
     COMM serial mpi
     NUM_MPI_PROCS 1
     FAIL_REGULAR_EXPRESSION "  FAILED  "
@@ -66,7 +66,7 @@ ENDIF()
 IF(Kokkos_ENABLE_Cuda)
   TRIBITS_ADD_EXECUTABLE_AND_TEST(
     UnitTest_Cuda
-    SOURCES UnitTestMain.cpp TestCuda.cpp
+    SOURCES UnitTestMain.cpp TestCuda.cpp TestCuda_a.cpp TestCuda_b.cpp TestCuda_c.cpp
     COMM serial mpi
     NUM_MPI_PROCS 1
     FAIL_REGULAR_EXPRESSION "  FAILED  "
@@ -76,27 +76,30 @@ ENDIF()
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
   UnitTest_Default
-  SOURCES UnitTestMain.cpp TestDefaultDeviceType.cpp TestDefaultDeviceTypeInit.cpp
+  SOURCES UnitTestMain.cpp TestDefaultDeviceType.cpp TestDefaultDeviceType_a.cpp
   COMM serial mpi
   NUM_MPI_PROCS 1
   FAIL_REGULAR_EXPRESSION "  FAILED  "
     TESTONLYLIBS kokkos_gtest
 )
 
+foreach(INITTESTS_NUM RANGE 1 16)
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_HWLOC
-  SOURCES UnitTestMain.cpp  TestHWLOC.cpp
+  UnitTest_DefaultInit_${INITTESTS_NUM}
+  SOURCES UnitTestMain.cpp TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp
   COMM serial mpi
   NUM_MPI_PROCS 1
   FAIL_REGULAR_EXPRESSION "  FAILED  "
     TESTONLYLIBS kokkos_gtest
 )
+endforeach(INITTESTS_NUM)
 
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest_AllocationTracker
-  SOURCES UnitTestMain.cpp  TestAllocationTracker.cpp
+  UnitTest_HWLOC
+  SOURCES UnitTestMain.cpp  TestHWLOC.cpp
   COMM serial mpi
   NUM_MPI_PROCS 1
   FAIL_REGULAR_EXPRESSION "  FAILED  "
-  TESTONLYLIBS kokkos_gtest
+    TESTONLYLIBS kokkos_gtest
 )
+
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
index 6e0f56a62c..3d9d212c1e 100644
--- a/lib/kokkos/core/unit_test/Makefile
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -61,17 +61,16 @@ OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_HWLOC
 TEST_TARGETS += test-hwloc
 
-OBJ_ALLOCATIONTRACKER = TestAllocationTracker.o UnitTestMain.o gtest-all.o
-TARGETS += KokkosCore_UnitTest_AllocationTracker
-TEST_TARGETS += test-allocationtracker
-
-OBJ_DEFAULT = TestDefaultDeviceType.o UnitTestMain.o gtest-all.o
+OBJ_DEFAULT = TestDefaultDeviceType.o TestDefaultDeviceType_a.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_Default
 TEST_TARGETS += test-default
 
-OBJ_DEFAULTINIT = TestDefaultDeviceTypeInit.o UnitTestMain.o gtest-all.o
-TARGETS += KokkosCore_UnitTest_DefaultInit
-TEST_TARGETS += test-default-init
+NUM_INITTESTS = 16
+INITTESTS_NUMBERS := $(shell seq 1 ${NUM_INITTESTS})
+INITTESTS_TARGETS := $(addprefix KokkosCore_UnitTest_DefaultDeviceTypeInit_,${INITTESTS_NUMBERS})
+TARGETS += ${INITTESTS_TARGETS}
+INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
+TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
 
 OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_Synchronic
@@ -101,8 +100,8 @@ KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DE
 KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Default
 
-KokkosCore_UnitTest_DefaultInit: $(OBJ_DEFAULTINIT) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULTINIT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultInit
+${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
 KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
@@ -131,8 +130,8 @@ test-allocationtracker: KokkosCore_UnitTest_AllocationTracker
 test-default: KokkosCore_UnitTest_Default
 	./KokkosCore_UnitTest_Default
 
-test-default-init: KokkosCore_UnitTest_DefaultInit
-	./KokkosCore_UnitTest_DefaultInit
+${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
+	./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
 test-synchronic: KokkosCore_UnitTest_Synchronic
 	./KokkosCore_UnitTest_Synchronic
diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp
index 1fbb4bf4b1..5388a60787 100644
--- a/lib/kokkos/core/unit_test/TestAggregate.hpp
+++ b/lib/kokkos/core/unit_test/TestAggregate.hpp
@@ -52,665 +52,6 @@
 
 /*--------------------------------------------------------------------------*/
 
-#if ! KOKKOS_USING_EXP_VIEW
-
-namespace Test {
-
-struct EmbedArray {};
-
-struct ArrayProxyContiguous {};
-struct ArrayProxyStrided {};
-
-template< typename T , unsigned N = 0 , class Proxy = void >
-struct Array ;
-
-template< typename T >
-struct Array<T,0,ArrayProxyContiguous>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = 0 };
-  T * const value ;
-  const unsigned count ;
-
-  KOKKOS_INLINE_FUNCTION
-  Array( T * v , unsigned n ) : value(v), count(n) {}
-
-  template< class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; }
-};
-
-template< typename T , unsigned N >
-struct Array<T,N,ArrayProxyContiguous>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = N };
-  T * const value ;
-
-  KOKKOS_INLINE_FUNCTION
-  Array( T * v , unsigned ) : value(v) {}
-
-  template< class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; }
-};
-
-template< typename T , unsigned N >
-struct Array<T,N,ArrayProxyStrided>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = N };
-  T * const value ;
-  const unsigned stride ;
-
-  KOKKOS_INLINE_FUNCTION
-  Array( T * v , unsigned , unsigned s ) : value(v), stride(s) {}
-
-  template< class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; }
-};
-
-template< typename T >
-struct Array<T,0,ArrayProxyStrided>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = 0 };
-  T * const value ;
-  const unsigned count ;
-  const unsigned stride ;
-
-  KOKKOS_INLINE_FUNCTION
-  Array( T * v , unsigned n , unsigned s ) : value(v), count(n), stride(s) {}
-
-  template< class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; }
-};
-
-template< typename T >
-struct Array<T,0,void>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = 0 };
-  T * value ;
-  const unsigned count ;
-
-  KOKKOS_INLINE_FUNCTION
-  Array() : value(0) , count(0) {}
-
-  template< unsigned N , class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array( const Array<T,N,Proxy> & rhs ) : value(rhs.value), count(N) {}
-};
-
-template< typename T , unsigned N >
-struct Array<T,N,void>
-{
-public:
-  typedef T value_type ;
-
-  enum { StaticLength = N };
-  T value[N] ;
-
-  template< class Proxy >
-  KOKKOS_INLINE_FUNCTION
-  Array & operator = ( const Array<T,N,Proxy> & ) { return *this ; }
-};
-
-} // namespace Test
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-namespace Impl {
-
-template< typename T , unsigned N >
-struct AnalyzeShape< Test::Array< T , N > >
-  : public ShapeInsert< typename AnalyzeShape< T >::shape , N >::type
-{
-private:
-  typedef AnalyzeShape< T > nested ;
-public:
-
-  typedef Test::EmbedArray specialize ;
-
-  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
-
-  typedef typename nested::array_intrinsic_type   array_intrinsic_type[ N ];
-  typedef Test::Array< T , N >          value_type ;
-  typedef Test::Array< T , N >          type ;
-
-  typedef const array_intrinsic_type  const_array_intrinsic_type ;
-  typedef const value_type  const_value_type ;
-  typedef const type        const_type ;
-
-  typedef typename nested::non_const_array_intrinsic_type          non_const_array_intrinsic_type[ N ];
-  typedef Test::Array< typename nested::non_const_value_type , N > non_const_value_type ;
-  typedef Test::Array< typename nested::non_const_value_type , N > non_const_type ;
-};
-
-template< typename T >
-struct AnalyzeShape< Test::Array< T , 0 > >
-  : public ShapeInsert< typename AnalyzeShape< T >::shape , 0 >::type
-{
-private:
-  typedef AnalyzeShape< T > nested ;
-public:
-
-  typedef Test::EmbedArray specialize ;
-
-  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
-
-  typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
-  typedef Test::Array< T , 0 >          value_type ;
-  typedef Test::Array< T , 0 >          type ;
-
-  typedef const array_intrinsic_type  const_array_intrinsic_type ;
-  typedef const value_type  const_value_type ;
-  typedef const type        const_type ;
-
-  typedef typename nested::non_const_array_intrinsic_type  * non_const_array_intrinsic_type ;
-  typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_value_type ;
-  typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_type ;
-};
-
-/*--------------------------------------------------------------------------*/
-
-template< class ValueType , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< ValueType
-                     , Test::EmbedArray
-                     , LayoutLeft
-                     , MemorySpace
-                     , MemoryTraits >
-{ typedef Test::EmbedArray type ; };
-
-template< class ValueType , class MemorySpace , class MemoryTraits >
-struct ViewSpecialize< ValueType
-                     , Test::EmbedArray
-                     , LayoutRight
-                     , MemorySpace
-                     , MemoryTraits >
-{ typedef Test::EmbedArray type ; };
-
-/*--------------------------------------------------------------------------*/
-
-template<>
-struct ViewAssignment< Test::EmbedArray , Test::EmbedArray , void >
-{
-  //------------------------------------
-  /** \brief  Compatible value and shape */
-
-  template< class DT , class DL , class DD , class DM ,
-            class ST , class SL , class SD , class SM >
-  KOKKOS_INLINE_FUNCTION
-  ViewAssignment(       View<DT,DL,DD,DM,Test::EmbedArray> & dst
-                , const View<ST,SL,SD,SM,Test::EmbedArray> & src
-                , const typename enable_if<(
-                    ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
-                                    ViewTraits<ST,SL,SD,SM> >::value
-                    )>::type * = 0
-                  )
-  {
-    dst.m_offset_map.assign( src.m_offset_map );
-
-    dst.m_ptr_on_device = src.m_ptr_on_device ;
-
-    dst.m_tracker = src.m_tracker;
-  }
-};
-
-template<>
-struct ViewAssignment< ViewDefault , Test::EmbedArray , void >
-{
-  //------------------------------------
-  /** \brief  Compatible value and shape */
-
-  template< class ST , class SL , class SD , class SM >
-  KOKKOS_INLINE_FUNCTION
-  ViewAssignment( typename View<ST,SL,SD,SM,Test::EmbedArray>::array_type & dst
-                , const View<ST,SL,SD,SM,Test::EmbedArray> & src
-                )
-  {
-    dst.m_offset_map.assign( src.m_offset_map );
-
-    dst.m_ptr_on_device = src.m_ptr_on_device ;
-
-    dst.m_tracker = src.m_tracker;
-  }
-};
-
-
-} // namespace Impl
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Kokkos {
-
-template< class DataType ,
-          class Arg1Type ,
-          class Arg2Type ,
-          class Arg3Type >
-class View< DataType , Arg1Type , Arg2Type , Arg3Type , Test::EmbedArray >
-  : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
-{
-public:
-
-  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
-
-private:
-
-  // Assignment of compatible views requirement:
-  template< class , class , class , class , class > friend class View ;
-
-  // Assignment of compatible subview requirement:
-  template< class , class , class > friend struct Impl::ViewAssignment ;
-
-  typedef Impl::ViewOffset< typename traits::shape_type ,
-                            typename traits::array_layout > offset_map_type ;
-
-  typedef Impl::ViewDataManagement< traits > view_data_management ;
-
-  // traits::value_type = Test::Array< T , N >
-
-  typename traits::value_type::value_type * m_ptr_on_device ;
-  offset_map_type                           m_offset_map ;
-  view_data_management                      m_management ;
-  Impl::AllocationTracker                   m_tracker ;
-
-public:
-
-  typedef View< typename traits::array_intrinsic_type ,
-                typename traits::array_layout ,
-                typename traits::execution_space ,
-                typename traits::memory_traits > array_type ;
-
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::execution_space ,
-                typename traits::memory_traits > non_const_type ;
-
-  typedef View< typename traits::const_data_type ,
-                typename traits::array_layout ,
-                typename traits::execution_space ,
-                typename traits::memory_traits > const_type ;
-
-  typedef View< typename traits::non_const_data_type ,
-                typename traits::array_layout ,
-                typename traits::host_mirror_space ,
-                void > HostMirror ;
-
-  //------------------------------------
-  // Shape
-
-  enum { Rank = traits::rank - 1 };
-
-  KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
-  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const
-  {
-    return   m_offset_map.N0
-           * m_offset_map.N1
-           * m_offset_map.N2
-           * m_offset_map.N3
-           * m_offset_map.N4
-           * m_offset_map.N5
-           * m_offset_map.N6
-           * m_offset_map.N7
-           ;
-  }
-
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  typename traits::size_type dimension( const iType & i ) const
-    { return Impl::dimension( m_offset_map , i ); }
-
-  //------------------------------------
-  // Destructor, constructors, assignment operators:
-
-  KOKKOS_INLINE_FUNCTION
-  ~View() {}
-
-  KOKKOS_INLINE_FUNCTION
-  View()
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  { m_offset_map.assing(0,0,0,0,0,0,0,0); }
-
-  KOKKOS_INLINE_FUNCTION
-  View( const View & rhs )
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  {
-    (void) Impl::ViewAssignment<
-      typename traits::specialize ,
-      typename traits::specialize >( *this , rhs );
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View & rhs )
-    {
-      (void) Impl::ViewAssignment<
-        typename traits::specialize ,
-        typename traits::specialize >( *this , rhs );
-      return *this ;
-    }
-
-  //------------------------------------
-  // Construct or assign compatible view:
-
-  template< class RT , class RL , class RD , class RM , class RS >
-  KOKKOS_INLINE_FUNCTION
-  View( const View<RT,RL,RD,RM,RS> & rhs )
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  {
-    (void) Impl::ViewAssignment<
-      typename traits::specialize , RS >( *this , rhs );
-  }
-
-  template< class RT , class RL , class RD , class RM , class RS >
-  KOKKOS_INLINE_FUNCTION
-  View & operator = ( const View<RT,RL,RD,RM,RS> & rhs )
-    {
-      (void) Impl::ViewAssignment<
-        typename traits::specialize , RS >( *this , rhs );
-      return *this ;
-    }
-
-  //------------------------------------
-  // Allocation of a managed view with possible alignment padding.
-
-  template< class AllocationProperties >
-  explicit inline
-  View( const AllocationProperties & prop ,
-        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 ,
-        const size_t n1 = 0 ,
-        const size_t n2 = 0 ,
-        const size_t n3 = 0 ,
-        const size_t n4 = 0 ,
-        const size_t n5 = 0 ,
-        const size_t n6 = 0 ,
-        const size_t n7 = 0 )
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  {
-    typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
-
-    typedef typename traits::memory_space  memory_space ;
-    typedef typename traits::value_type::value_type   scalar_type ;
-
-    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
-    m_offset_map.set_padding();
-
-    m_tracker = memory_space::allocate_and_track( Alloc::label( prop ), sizeof(scalar_type) * m_offset_map.capacity() );
-
-    m_ptr_on_device = reinterpret_cast<scalar_type *>(m_tracker.alloc_ptr());
-
-    (void) Impl::ViewDefaultConstruct< typename traits::execution_space , scalar_type , Alloc::Initialize >( m_ptr_on_device , m_offset_map.capacity() );
-  }
-
-  //------------------------------------
-  // Assign an unmanaged View from pointer, can be called in functors.
-  // No alignment padding is performed.
-
-  typedef Impl::if_c< ! traits::is_managed ,
-                      typename traits::value_type::value_type * ,
-                      Impl::ViewError::user_pointer_constructor_requires_unmanaged >
-    if_user_pointer_constructor ;
-
-  View( typename if_user_pointer_constructor::type ptr ,
-        const size_t n0 = 0 ,
-        const size_t n1 = 0 ,
-        const size_t n2 = 0 ,
-        const size_t n3 = 0 ,
-        const size_t n4 = 0 ,
-        const size_t n5 = 0 ,
-        const size_t n6 = 0 ,
-        const size_t n7 = 0 )
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  {
-    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
-    m_ptr_on_device = if_user_pointer_constructor::select( ptr );
-    m_management.set_unmanaged();
-  }
-
-  //------------------------------------
-  // Assign unmanaged View to portion of Device shared memory
-
-  typedef Impl::if_c< ! traits::is_managed ,
-                      typename traits::execution_space ,
-                      Impl::ViewError::device_shmem_constructor_requires_unmanaged >
-      if_device_shmem_constructor ;
-
-  explicit KOKKOS_INLINE_FUNCTION
-  View( typename if_device_shmem_constructor::type & dev ,
-        const unsigned n0 = 0 ,
-        const unsigned n1 = 0 ,
-        const unsigned n2 = 0 ,
-        const unsigned n3 = 0 ,
-        const unsigned n4 = 0 ,
-        const unsigned n5 = 0 ,
-        const unsigned n6 = 0 ,
-        const unsigned n7 = 0 )
-    : m_ptr_on_device(0)
-    , m_offset_map()
-    , m_management()
-    , m_tracker()
-  {
-    typedef typename traits::value_type::value_type   scalar_type ;
-
-    enum { align = 8 };
-    enum { mask  = align - 1 };
-
-    typedef Impl::if_c< ! traits::is_managed ,
-                        scalar_type * ,
-                        Impl::ViewError::device_shmem_constructor_requires_unmanaged >
-      if_device_shmem_pointer ;
-
-    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
-
-    // Select the first argument:
-    m_ptr_on_device = if_device_shmem_pointer::select(
-     (scalar_type *) dev.get_shmem( unsigned( sizeof(scalar_type) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
-  }
-
-  static inline
-  unsigned shmem_size( const unsigned n0 = 0 ,
-                       const unsigned n1 = 0 ,
-                       const unsigned n2 = 0 ,
-                       const unsigned n3 = 0 ,
-                       const unsigned n4 = 0 ,
-                       const unsigned n5 = 0 ,
-                       const unsigned n6 = 0 ,
-                       const unsigned n7 = 0 )
-  {
-    enum { align = 8 };
-    enum { mask  = align - 1 };
-
-    typedef typename traits::value_type::value_type   scalar_type ;
-
-    offset_map_type offset_map ;
-
-    offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
-
-    return unsigned( sizeof(scalar_type) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ;
-  }
-
-  //------------------------------------
-  // Is not allocated
-
-  KOKKOS_INLINE_FUNCTION
-  bool is_null() const { return 0 == m_ptr_on_device ; }
-
-  //------------------------------------
-  // LayoutLeft, rank 2:
-
-  typedef Test::Array< typename traits::value_type::value_type ,
-                       traits::value_type::StaticLength ,
-                       Test::ArrayProxyStrided > LeftValue ;
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
-    operator[] ( const iType0 & i0 ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
-    }
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
-    operator() ( const iType0 & i0 ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
-    }
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
-    at( const iType0 & i0 , const int , const int , const int ,
-        const int , const int , const int , const int ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
-    }
-
-  //------------------------------------
-  // LayoutRight, rank 2:
-
-  typedef Test::Array< typename traits::value_type::value_type ,
-                       traits::value_type::StaticLength ,
-                       Test::ArrayProxyContiguous > RightValue ;
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
-    operator[] ( const iType0 & i0 ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
-    }
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
-    operator() ( const iType0 & i0 ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
-    }
-
-  template< typename iType0 >
-  KOKKOS_INLINE_FUNCTION
-  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
-    at( const iType0 & i0 , const int , const int , const int ,
-        const int , const int , const int , const int ) const
-    {
-      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
-      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
-
-      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
-    }
-
-  //------------------------------------
-  // Access to the underlying contiguous storage of this view specialization.
-  // These methods are specific to specialization of a view.
-
-  KOKKOS_INLINE_FUNCTION
-  typename traits::value_type::value_type * ptr_on_device() const { return m_ptr_on_device ; }
-
-  // Stride of physical storage, dimensioned to at least Rank
-  template< typename iType >
-  KOKKOS_INLINE_FUNCTION
-  void stride( iType * const s ) const
-    { m_offset_map.stride( s ); }
-
-  // Count of contiguously allocated data members including padding.
-  KOKKOS_INLINE_FUNCTION
-  typename traits::size_type capacity() const
-    { return m_offset_map.capacity(); }
-};
-
-} // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-namespace Test {
-
-template< class DeviceType >
-int TestViewAggregate()
-{
-  typedef Kokkos::View< Test::Array<double,32> * , DeviceType > a32_type ;
-  typedef typename a32_type::array_type a32_base_type ;
-
-  typedef Kokkos::View< Test::Array<double> * , DeviceType > a0_type ;
-  typedef typename a0_type::array_type a0_base_type ;
-
-  a32_type      a32("a32",100);
-  a32_base_type a32_base ;
-
-  a0_type       a0("a0",100,32);
-  a0_base_type  a0_base ;
-
-  a32_base = a32 ;
-  a0_base = a0 ;
-
-
-  return 0 ;
-}
-
-}
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#else /* #if ! KOKKOS_USING_EXP_VIEW */
-
 #include <impl/KokkosExp_ViewArray.hpp>
 
 namespace Test {
@@ -762,8 +103,6 @@ void TestViewAggregate()
 
 }
 
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
diff --git a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
index b0b3747087..bd05cd347b 100644
--- a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
+++ b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
@@ -57,12 +57,10 @@ struct StaticArray {
   T value[N] ;
 
   KOKKOS_INLINE_FUNCTION
-  StaticArray()
-    { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = T(); }
+  StaticArray() = default;
 
   KOKKOS_INLINE_FUNCTION
-  StaticArray( const StaticArray & rhs )
-    { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i]; }
+  StaticArray( const StaticArray & rhs ) = default;
 
   KOKKOS_INLINE_FUNCTION
   operator T () { return value[0]; }
@@ -75,11 +73,7 @@ struct StaticArray {
     }
 
   KOKKOS_INLINE_FUNCTION
-  StaticArray & operator = ( const StaticArray & rhs )
-    {
-      for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i] ;
-      return *this ;
-    }
+  StaticArray & operator = ( const StaticArray & rhs ) = default;
 
   KOKKOS_INLINE_FUNCTION
   StaticArray operator * ( const StaticArray & rhs )
@@ -111,6 +105,8 @@ struct StaticArray {
     }
 };
 
+static_assert(std::is_trivial<StaticArray<int, 4>>::value, "Not trivial");
+
 template< typename T , class Space >
 struct DOT {
   typedef T      value_type ;
diff --git a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp
deleted file mode 100644
index b3a7fe9803..0000000000
--- a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <gtest/gtest.h>
-
-#include <iostream>
-#include <vector>
-
-#include <Kokkos_Core.hpp>
-
-#include <impl/Kokkos_AllocationTracker.hpp>
-#include <impl/Kokkos_BasicAllocators.hpp>
-
-namespace Test {
-
-class alocation_tracker : public ::testing::Test {
-protected:
-  static void SetUpTestCase()
-  {
-    Kokkos::initialize();
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::finalize();
-  }
-};
-
-TEST_F( alocation_tracker, simple)
-{
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-  using namespace Kokkos::Impl;
-
-  {
-    AllocationTracker tracker;
-    EXPECT_FALSE( tracker.is_valid() );
-  }
-
-  // test ref count and label
-  {
-    int size = 100;
-    std::vector<AllocationTracker> trackers(size);
-
-    trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test");
-
-    for (int i=0; i<size; ++i) {
-      trackers[i] = trackers[0];
-    }
-
-    EXPECT_EQ(100u, trackers[0].ref_count());
-    EXPECT_EQ(std::string("Test"), std::string(trackers[0].label()));
-  }
-
-
-  // test circular list
-  {
-    int num_allocs = 3000;
-    unsigned ref_count = 100;
-
-    std::vector<AllocationTracker> trackers(num_allocs);
-
-    for (int i=0; i<num_allocs; ++i) {
-      trackers[i] = AllocationTracker( MallocAllocator(), 128, "Test");
-      std::vector<AllocationTracker> ref_trackers(ref_count);
-      for (unsigned j=0; j<ref_count; ++j) {
-        ref_trackers[j] = trackers[i];
-      }
-      EXPECT_EQ( ref_count + 1u, trackers[i].ref_count() );
-    }
-
-    for (int i=0; i<num_allocs; ++i) {
-      EXPECT_EQ( 1u, trackers[i].ref_count() );
-    }
-  }
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-}
-
-TEST_F( alocation_tracker, force_leaks)
-{
-// uncomment to force memory leaks
-#if 0
-  using namespace Kokkos::Impl;
-  Kokkos::kokkos_malloc("Forced Leak", 4096*10);
-  Kokkos::kokkos_malloc<Kokkos::HostSpace>("Forced Leak", 4096*10);
-#endif
-}
-
-TEST_F( alocation_tracker, disable_reference_counting)
-{
-
-#if ! KOKKOS_USING_EXP_VIEW
-
-  using namespace Kokkos::Impl;
-  // test ref count and label
-  {
-    int size = 100;
-    std::vector<AllocationTracker> trackers(size);
-
-    trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test");
-
-    for (int i=1; i<size; ++i) {
-      Kokkos::Impl::AllocationTracker::disable_tracking();
-      trackers[i] = trackers[0] ;
-      Kokkos::Impl::AllocationTracker::enable_tracking();
-    }
-
-    EXPECT_EQ(1u, trackers[0].ref_count());
-    EXPECT_EQ(std::string("Test"), std::string(trackers[0].label()));
-  }
-
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-
-}
-
-} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp
index 7b3ab14c06..e948723574 100644
--- a/lib/kokkos/core/unit_test/TestAtomic.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomic.hpp
@@ -84,10 +84,9 @@ struct SuperScalar {
   }
 
   KOKKOS_INLINE_FUNCTION
-  volatile SuperScalar& operator = (const SuperScalar& src) volatile  {
+  void operator = (const SuperScalar& src) volatile  {
     for(int i=0; i<N; i++)
       val[i] = src.val[i];
-    return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -208,6 +207,10 @@ T AddLoopSerial(int loop) {
   return val;
 }
 
+//------------------------------------------------------
+//--------------atomic_compare_exchange-----------------
+//------------------------------------------------------
+
 template<class T,class DEVICE_TYPE>
 struct CASFunctor{
   typedef DEVICE_TYPE execution_space;
@@ -270,6 +273,10 @@ T CASLoopSerial(int loop) {
   return val;
 }
 
+//----------------------------------------------
+//--------------atomic_exchange-----------------
+//----------------------------------------------
+
 template<class T,class DEVICE_TYPE>
 struct ExchFunctor{
   typedef DEVICE_TYPE execution_space;
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
new file mode 100644
index 0000000000..aee4bda06c
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -0,0 +1,841 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomicOperations {
+
+//-----------------------------------------------
+//--------------zero_functor---------------------
+//-----------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<T,execution_space> type;
+  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    data() = 0;
+  }
+};
+
+//-----------------------------------------------
+//--------------init_functor---------------------
+//-----------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct InitFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<T,execution_space> type;
+  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  type data;
+  T init_value ;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    data() = init_value;
+  }
+
+  InitFunctor(T _init_value) : init_value(_init_value) {}
+};
+
+
+//---------------------------------------------------
+//--------------atomic_fetch_max---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct MaxFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    //Kokkos::atomic_fetch_max(&data(),(T)1);
+    Kokkos::atomic_fetch_max(&data(),(T)i1);
+  }
+  MaxFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T MaxAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct MaxFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T MaxAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = (i0 > i1 ? i0 : i1) ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool MaxAtomicTest(T i0, T i1)
+{
+  T res       = MaxAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MaxAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = MaxAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_min---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct MinFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_min(&data(),(T)i1);
+  }
+  MinFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T MinAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct MinFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T MinAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = (i0 < i1 ? i0 : i1) ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool MinAtomicTest(T i0, T i1)
+{
+  T res       = MinAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MinAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = MinAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mul---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct MulFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_mul(&data(),(T)i1);
+  }
+  MulFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T MulAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct MulFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T MulAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0*i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool MulAtomicTest(T i0, T i1)
+{
+  T res       = MulAtomic<T,DeviceType>(i0,i1);
+  T resSerial = MulAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = MulAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_div---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct DivFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_div(&data(),(T)i1);
+  }
+  DivFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T DivAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct DivFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T DivAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0/i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool DivAtomicTest(T i0, T i1)
+{
+  T res       = DivAtomic<T,DeviceType>(i0,i1);
+  T resSerial = DivAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = DivAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_mod---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct ModFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_mod(&data(),(T)i1);
+  }
+  ModFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T ModAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct ModFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T ModAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0%i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool ModAtomicTest(T i0, T i1)
+{
+  T res       = ModAtomic<T,DeviceType>(i0,i1);
+  T resSerial = ModAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = ModAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_and---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct AndFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_and(&data(),(T)i1);
+  }
+  AndFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T AndAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct AndFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T AndAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0&i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool AndAtomicTest(T i0, T i1)
+{
+  T res       = AndAtomic<T,DeviceType>(i0,i1);
+  T resSerial = AndAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = AndAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_or----------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct OrFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_or(&data(),(T)i1);
+  }
+  OrFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T OrAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct OrFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T OrAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0|i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool OrAtomicTest(T i0, T i1)
+{
+  T res       = OrAtomic<T,DeviceType>(i0,i1);
+  T resSerial = OrAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = OrAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_xor---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct XorFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_xor(&data(),(T)i1);
+  }
+  XorFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T XorAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct XorFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T XorAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0^i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool XorAtomicTest(T i0, T i1)
+{
+  T res       = XorAtomic<T,DeviceType>(i0,i1);
+  T resSerial = XorAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = XorAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_lshift---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct LShiftFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_lshift(&data(),(T)i1);
+  }
+  LShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T LShiftAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct LShiftFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T LShiftAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0<<i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool LShiftAtomicTest(T i0, T i1)
+{
+  T res       = LShiftAtomic<T,DeviceType>(i0,i1);
+  T resSerial = LShiftAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = LShiftAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+//---------------------------------------------------
+//--------------atomic_fetch_rshift---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct RShiftFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+  T i0;
+  T i1;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_rshift(&data(),(T)i1);
+  }
+  RShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+};
+
+template<class T, class execution_space >
+T RShiftAtomic(T i0 , T i1) {
+  struct InitFunctor<T,execution_space> f_init(i0);
+  typename InitFunctor<T,execution_space>::type data("Data");
+  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+  f_init.data = data;
+  Kokkos::parallel_for(1,f_init);
+  execution_space::fence();
+
+  struct RShiftFunctor<T,execution_space> f(i0,i1);
+  f.data = data;
+  Kokkos::parallel_for(1,f);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T RShiftAtomicCheck(T i0 , T i1) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  *data = i0>>i1 ;
+
+  T val = *data;
+  delete [] data;
+  return val;
+}
+
+template<class T,class DeviceType>
+bool RShiftAtomicTest(T i0, T i1)
+{
+  T res       = RShiftAtomic<T,DeviceType>(i0,i1);
+  T resSerial = RShiftAtomicCheck<T>(i0,i1);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = RShiftAtomicTest"
+              << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+  return passed ;
+}
+
+
+//---------------------------------------------------
+//--------------atomic_test_control------------------
+//---------------------------------------------------
+
+template<class T,class DeviceType>
+bool AtomicOperationsTestIntegralType( int i0 , int i1 , int test )
+{
+  switch (test) {
+    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 5: return ModAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 6: return AndAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 7: return OrAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 8: return XorAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 9: return LShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 10: return RShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+  }
+  return 0;
+}
+
+template<class T,class DeviceType>
+bool AtomicOperationsTestNonIntegralType( int i0 , int i1 , int test )
+{
+  switch (test) {
+    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+  }
+  return 0;
+}
+
+} // namespace
+
diff --git a/lib/kokkos/core/unit_test/TestCuda.cpp b/lib/kokkos/core/unit_test/TestCuda.cpp
index 3958c1a344..e615566252 100644
--- a/lib/kokkos/core/unit_test/TestCuda.cpp
+++ b/lib/kokkos/core/unit_test/TestCuda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -81,24 +81,31 @@
 #include <TestTaskPolicy.hpp>
 #include <TestPolicyConstruction.hpp>
 
+#include <TestMDRange.hpp>
+
 //----------------------------------------------------------------------------
 
 class cuda : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
+  static void SetUpTestCase();
+  static void TearDownTestCase();
+};
+
+void cuda::SetUpTestCase()
   {
     Kokkos::Cuda::print_configuration( std::cout );
     Kokkos::HostSpace::execution_space::initialize();
     Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
   }
-  static void TearDownTestCase()
+
+void cuda::TearDownTestCase()
   {
     Kokkos::Cuda::finalize();
     Kokkos::HostSpace::execution_space::finalize();
   }
-};
 
 //----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
 
 namespace Test {
 
@@ -116,6 +123,11 @@ void test_cuda_spaces_int_value( int * ptr )
   if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
 }
 
+TEST_F( cuda , md_range ) {
+  TestMDRange_2D< Kokkos::Cuda >::test_for2(100,100);
+
+  TestMDRange_3D< Kokkos::Cuda >::test_for3(100,100,100);
+}
 
 TEST_F( cuda , compiler_macros )
 {
@@ -223,7 +235,6 @@ struct TestViewCudaTexture {
     }
 };
 
-
 TEST_F( cuda , impl_view_texture )
 {
   TestViewCudaTexture< Kokkos::CudaSpace >::run();
@@ -265,7 +276,6 @@ struct TestViewCudaAccessible {
     }
 };
 
-
 TEST_F( cuda , impl_view_accessible )
 {
   TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
@@ -276,338 +286,5 @@ TEST_F( cuda , impl_view_accessible )
   TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
   TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
 }
-/*
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, view_impl )
-{
-  // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater
-
-  test_view_impl< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_api )
-{
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
-
-  TestViewAPI< double , Kokkos::Cuda >();
-  TestViewAPI< double , Kokkos::CudaUVMSpace >();
-
-#if 0
-  Kokkos::View<double, Kokkos::Cuda > x("x");
-  Kokkos::View<double[1], Kokkos::Cuda > y("y");
-  // *x = 10 ;
-  // x() = 10 ;
-  // y[0] = 10 ;
-  // y(0) = 10 ;
-#endif
-}
-
-
-TEST_F( cuda , view_nested_view )
-{
-  ::Test::view_nested_view< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_assign_strided ) {
-  TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_left_0 ) {
-  TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_1 ) {
-  TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_2 ) {
-  TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_3 ) {
-  TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_0 ) {
-  TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_1 ) {
-  TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_3 ) {
-  TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_1d_assign ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_3d ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_5d ) {
-  TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >();
-}
-
-
-TEST_F( cuda, range_tag )
-{
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
-}
-
-TEST_F( cuda, team_tag )
-{
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
-}
-
-TEST_F( cuda, reduce )
-{
-  TestReduce< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduce< double , Kokkos::Cuda >( 1000000 );
-  TestReduce< int , Kokkos::Cuda >( 0 );
-}
-
-TEST_F( cuda, reduce_team )
-{
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
-}
-
-TEST_F( cuda, shared_team )
-{
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
-}
-
-
-#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
-TEST_F( cuda, lambda_shared_team )
-{
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
-}
-#endif
-
-
-TEST_F( cuda, reduce_dynamic )
-{
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamic< double , Kokkos::Cuda >( 1000000 );
-}
-
-TEST_F( cuda, reduce_dynamic_view )
-{
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
-}
-
-TEST_F( cuda, atomic )
-{
-  const int loop_count = 1e3 ;
-
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, tile_layout)
-{
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
-}
-
-TEST_F( cuda , view_aggregate )
-{
-  TestViewAggregate< Kokkos::Cuda >();
-  TestViewAggregateReduction< Kokkos::Cuda >();
-}
-
-
-TEST_F( cuda , scan )
-{
-  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
-  TestScan< Kokkos::Cuda >( 1000000 );
-  TestScan< Kokkos::Cuda >( 10000000 );
-
-  TestScan< Kokkos::Cuda >( 0 );
-  TestScan< Kokkos::Cuda >( 0 , 0 );
-
-  Kokkos::Cuda::fence();
-}
-
-TEST_F( cuda , team_scan )
-{
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
-}
-
-TEST_F( cuda , memory_pool )
-{
-  bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 );
-  ASSERT_TRUE( val_uvm );
-
-  Kokkos::Cuda::fence();
-
-  TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 );
-
-  Kokkos::Cuda::fence();
-}
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , template_meta_functions )
-{
-  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
-}
-
-//----------------------------------------------------------------------------
-
-namespace Test {
-
-TEST_F( cuda , reduction_deduction )
-{
-  TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , team_vector )
-{
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
-}
-*/
-}
-
-//----------------------------------------------------------------------------
-/*
-#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
 
-TEST_F( cuda , task_policy )
-{
-  TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 );
-
-  for ( long i = 0 ; i < 15 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096);
-  }
-  for ( long i = 0 ; i < 35 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096);
-  }
 }
-
-TEST_F( cuda , task_team )
-{
-  TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000);
-}
-
-TEST_F( cuda , task_latch )
-{
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(10);
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(1000);
-}
-
-#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
-
diff --git a/lib/kokkos/core/unit_test/TestCuda_a.cpp b/lib/kokkos/core/unit_test/TestCuda_a.cpp
index 05716153d1..4680c33386 100644
--- a/lib/kokkos/core/unit_test/TestCuda_a.cpp
+++ b/lib/kokkos/core/unit_test/TestCuda_a.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -85,199 +85,13 @@
 
 class cuda : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    Kokkos::Cuda::print_configuration( std::cout );
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  }
-  static void TearDownTestCase()
-  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 //----------------------------------------------------------------------------
 
 namespace Test {
-/*
-__global__
-void test_abort()
-{
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-    Kokkos::CudaSpace ,
-    Kokkos::HostSpace >::verify();
-}
-
-__global__
-void test_cuda_spaces_int_value( int * ptr )
-{
-  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
-}
-
-
-TEST_F( cuda , compiler_macros )
-{
-  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
-}
-
-TEST_F( cuda , memory_space )
-{
-  TestMemorySpace< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, uvm )
-{
-  if ( Kokkos::CudaUVMSpace::available() ) {
-
-    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
-
-    *uvm_ptr = 42 ;
-
-    Kokkos::Cuda::fence();
-    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
-    Kokkos::Cuda::fence();
-
-    EXPECT_EQ( *uvm_ptr, int(2*42) );
-
-    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , impl_shared_alloc )
-{
-  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
-}
-
-TEST_F( cuda, policy_construction) {
-  TestRangePolicyConstruction< Kokkos::Cuda >();
-  TestTeamPolicyConstruction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , impl_view_mapping )
-{
-  test_view_mapping< Kokkos::Cuda >();
-  test_view_mapping< Kokkos::CudaUVMSpace >();
-  test_view_mapping_subview< Kokkos::Cuda >();
-  test_view_mapping_subview< Kokkos::CudaUVMSpace >();
-  test_view_mapping_operator< Kokkos::Cuda >();
-  test_view_mapping_operator< Kokkos::CudaUVMSpace >();
-  TestViewMappingAtomic< Kokkos::Cuda >::run();
-}
-
-TEST_F( cuda , view_of_class )
-{
-  TestViewMappingClassValue< Kokkos::CudaSpace >::run();
-  TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace >
-struct TestViewCudaTexture {
-
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-  using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
-
-  V m_base ;
-  T m_tex ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_tex[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaTexture()
-    : m_base("base",N)
-    , m_tex( m_base )
-    {}
-
-  static void run()
-    {
-      EXPECT_TRUE( ( std::is_same< typename V::reference_type
-                                 , double &
-                                 >::value ) );
-
-      EXPECT_TRUE( ( std::is_same< typename T::reference_type
-                                 , const double
-                                 >::value ) );
-
-      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
-      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
-
-      TestViewCudaTexture self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_texture )
-{
-  TestViewCudaTexture< Kokkos::CudaSpace >::run();
-  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace , class ExecSpace >
-struct TestViewCudaAccessible {
-
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-
-  V m_base ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_base[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaAccessible()
-    : m_base("base",N)
-    {}
-
-  static void run()
-    {
-      TestViewCudaAccessible self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
-      MemSpace::execution_space::fence();
-      // Next access is a different execution space, must complete prior kernel.
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_accessible )
-{
-  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
-}
-*/
-//----------------------------------------------------------------------------
 
 TEST_F( cuda, view_impl )
 {
@@ -304,7 +118,6 @@ TEST_F( cuda, view_api )
 #endif
 }
 
-
 TEST_F( cuda , view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Cuda >();
@@ -366,248 +179,4 @@ TEST_F( cuda, view_subview_2d_from_5d ) {
   TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >();
 }
 
-/*
-TEST_F( cuda, range_tag )
-{
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
-}
-
-TEST_F( cuda, team_tag )
-{
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
-}
-
-TEST_F( cuda, reduce )
-{
-  TestReduce< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduce< double , Kokkos::Cuda >( 1000000 );
-  TestReduce< int , Kokkos::Cuda >( 0 );
-}
-
-TEST_F( cuda, reduce_team )
-{
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
-
-TEST_F( cuda, shared_team )
-{
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
-}
-
-
-#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
-TEST_F( cuda, lambda_shared_team )
-{
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
-}
-#endif
-
-
-TEST_F( cuda, reduce_dynamic )
-{
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamic< double , Kokkos::Cuda >( 1000000 );
-}
-
-TEST_F( cuda, reduce_dynamic_view )
-{
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
-}
-
-TEST_F( cuda, atomic )
-{
-  const int loop_count = 1e3 ;
-
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, tile_layout)
-{
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
-}
-
-TEST_F( cuda , view_aggregate )
-{
-  TestViewAggregate< Kokkos::Cuda >();
-  TestViewAggregateReduction< Kokkos::Cuda >();
-}
-
-
-TEST_F( cuda , scan )
-{
-  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
-  TestScan< Kokkos::Cuda >( 1000000 );
-  TestScan< Kokkos::Cuda >( 10000000 );
-
-  TestScan< Kokkos::Cuda >( 0 );
-  TestScan< Kokkos::Cuda >( 0 , 0 );
-
-  Kokkos::Cuda::fence();
-}
-
-TEST_F( cuda , team_scan )
-{
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
-}
-
-TEST_F( cuda , memory_pool )
-{
-  bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 );
-  ASSERT_TRUE( val_uvm );
-
-  Kokkos::Cuda::fence();
-
-  TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 );
-
-  Kokkos::Cuda::fence();
-}
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , template_meta_functions )
-{
-  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
-}
-
-//----------------------------------------------------------------------------
-
-namespace Test {
-
-TEST_F( cuda , reduction_deduction )
-{
-  TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , team_vector )
-{
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
-}
-*/
-}
-
-//----------------------------------------------------------------------------
-/*
-#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
-
-TEST_F( cuda , task_policy )
-{
-  TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 );
-
-  for ( long i = 0 ; i < 15 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096);
-  }
-  for ( long i = 0 ; i < 35 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096);
-  }
-}
-
-TEST_F( cuda , task_team )
-{
-  TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000);
-}
-
-TEST_F( cuda , task_latch )
-{
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(10);
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(1000);
-}
-
-#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
-
diff --git a/lib/kokkos/core/unit_test/TestCuda_b.cpp b/lib/kokkos/core/unit_test/TestCuda_b.cpp
index 3d57347bb8..d4ca949e57 100644
--- a/lib/kokkos/core/unit_test/TestCuda_b.cpp
+++ b/lib/kokkos/core/unit_test/TestCuda_b.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -85,290 +85,22 @@
 
 class cuda : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    Kokkos::Cuda::print_configuration( std::cout );
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  }
-  static void TearDownTestCase()
-  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 //----------------------------------------------------------------------------
 
 namespace Test {
-/*
-__global__
-void test_abort()
-{
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-    Kokkos::CudaSpace ,
-    Kokkos::HostSpace >::verify();
-}
-
-__global__
-void test_cuda_spaces_int_value( int * ptr )
-{
-  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
-}
-
-
-TEST_F( cuda , compiler_macros )
-{
-  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
-}
-
-TEST_F( cuda , memory_space )
-{
-  TestMemorySpace< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, uvm )
-{
-  if ( Kokkos::CudaUVMSpace::available() ) {
-
-    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
-
-    *uvm_ptr = 42 ;
-
-    Kokkos::Cuda::fence();
-    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
-    Kokkos::Cuda::fence();
-
-    EXPECT_EQ( *uvm_ptr, int(2*42) );
-
-    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , impl_shared_alloc )
-{
-  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
-}
-
-TEST_F( cuda, policy_construction) {
-  TestRangePolicyConstruction< Kokkos::Cuda >();
-  TestTeamPolicyConstruction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , impl_view_mapping )
-{
-  test_view_mapping< Kokkos::Cuda >();
-  test_view_mapping< Kokkos::CudaUVMSpace >();
-  test_view_mapping_subview< Kokkos::Cuda >();
-  test_view_mapping_subview< Kokkos::CudaUVMSpace >();
-  test_view_mapping_operator< Kokkos::Cuda >();
-  test_view_mapping_operator< Kokkos::CudaUVMSpace >();
-  TestViewMappingAtomic< Kokkos::Cuda >::run();
-}
-
-TEST_F( cuda , view_of_class )
-{
-  TestViewMappingClassValue< Kokkos::CudaSpace >::run();
-  TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace >
-struct TestViewCudaTexture {
-
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-  using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
-
-  V m_base ;
-  T m_tex ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_tex[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaTexture()
-    : m_base("base",N)
-    , m_tex( m_base )
-    {}
-
-  static void run()
-    {
-      EXPECT_TRUE( ( std::is_same< typename V::reference_type
-                                 , double &
-                                 >::value ) );
-
-      EXPECT_TRUE( ( std::is_same< typename T::reference_type
-                                 , const double
-                                 >::value ) );
-
-      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
-      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
-
-      TestViewCudaTexture self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_texture )
-{
-  TestViewCudaTexture< Kokkos::CudaSpace >::run();
-  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace , class ExecSpace >
-struct TestViewCudaAccessible {
-
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-
-  V m_base ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_base[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaAccessible()
-    : m_base("base",N)
-    {}
-
-  static void run()
-    {
-      TestViewCudaAccessible self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
-      MemSpace::execution_space::fence();
-      // Next access is a different execution space, must complete prior kernel.
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_accessible )
-{
-  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, view_impl )
-{
-  // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater
-
-  test_view_impl< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_api )
-{
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
-
-  TestViewAPI< double , Kokkos::Cuda >();
-  TestViewAPI< double , Kokkos::CudaUVMSpace >();
-
-#if 0
-  Kokkos::View<double, Kokkos::Cuda > x("x");
-  Kokkos::View<double[1], Kokkos::Cuda > y("y");
-  // *x = 10 ;
-  // x() = 10 ;
-  // y[0] = 10 ;
-  // y(0) = 10 ;
-#endif
-}
-
-
-TEST_F( cuda , view_nested_view )
-{
-  ::Test::view_nested_view< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_assign_strided ) {
-  TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_left_0 ) {
-  TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_1 ) {
-  TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_2 ) {
-  TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_3 ) {
-  TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_0 ) {
-  TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_1 ) {
-  TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_3 ) {
-  TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_1d_assign ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_3d ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_5d ) {
-  TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >();
-}
-*/
 
 TEST_F( cuda, range_tag )
 {
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
+  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
   TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
   TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
   TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
@@ -380,6 +112,10 @@ TEST_F( cuda, range_tag )
 
 TEST_F( cuda, team_tag )
 {
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
+  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
   TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
   TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
   TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
@@ -393,6 +129,14 @@ TEST_F( cuda, reduce )
   TestReduce< int , Kokkos::Cuda >( 0 );
 }
 
+TEST_F( cuda , reducers )
+{
+  TestReducers<int, Kokkos::Cuda>::execute_integer();
+  TestReducers<size_t, Kokkos::Cuda>::execute_integer();
+  TestReducers<double, Kokkos::Cuda>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::Cuda>::execute_basic();
+}
+
 TEST_F( cuda, reduce_team )
 {
   TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
@@ -411,7 +155,6 @@ TEST_F( cuda, shared_team )
   TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-
 #if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
 TEST_F( cuda, lambda_shared_team )
 {
@@ -424,6 +167,14 @@ TEST_F( cuda, lambda_shared_team )
 }
 #endif
 
+TEST_F( cuda, shmem_size) {
+  TestShmemSize< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, multi_level_scratch) {
+  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
 
 TEST_F( cuda, reduce_dynamic )
 {
@@ -436,178 +187,5 @@ TEST_F( cuda, reduce_dynamic_view )
   TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
   TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
 }
-/*
-TEST_F( cuda, atomic )
-{
-  const int loop_count = 1e3 ;
-
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, tile_layout)
-{
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
 
-  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
-}
-
-TEST_F( cuda , view_aggregate )
-{
-  TestViewAggregate< Kokkos::Cuda >();
-  TestViewAggregateReduction< Kokkos::Cuda >();
 }
-
-
-TEST_F( cuda , scan )
-{
-  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
-  TestScan< Kokkos::Cuda >( 1000000 );
-  TestScan< Kokkos::Cuda >( 10000000 );
-
-  TestScan< Kokkos::Cuda >( 0 );
-  TestScan< Kokkos::Cuda >( 0 , 0 );
-
-  Kokkos::Cuda::fence();
-}
-
-TEST_F( cuda , team_scan )
-{
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
-}
-
-TEST_F( cuda , memory_pool )
-{
-  bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 );
-  ASSERT_TRUE( val_uvm );
-
-  Kokkos::Cuda::fence();
-
-  TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 );
-
-  Kokkos::Cuda::fence();
-}
-
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , template_meta_functions )
-{
-  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
-}
-
-//----------------------------------------------------------------------------
-
-namespace Test {
-
-TEST_F( cuda , reduction_deduction )
-{
-  TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , team_vector )
-{
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
-}
-*/
-}
-
-//----------------------------------------------------------------------------
-/*
-#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
-
-TEST_F( cuda , task_policy )
-{
-  TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 );
-
-  for ( long i = 0 ; i < 15 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib< Kokkos::Cuda >(i,4096);
-  }
-  for ( long i = 0 ; i < 35 ; ++i ) {
-      // printf("TestTaskPolicy::test_fib2< Kokkos::Cuda >(%d);\n",i);
-    TestTaskPolicy::test_fib2< Kokkos::Cuda >(i,4096);
-  }
-}
-
-TEST_F( cuda , task_team )
-{
-  TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000);
-}
-
-TEST_F( cuda , task_latch )
-{
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(10);
-  TestTaskPolicy::test_latch< Kokkos::Cuda >(1000);
-}
-
-#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
-
diff --git a/lib/kokkos/core/unit_test/TestCuda_c.cpp b/lib/kokkos/core/unit_test/TestCuda_c.cpp
index 24635959c5..70584cead1 100644
--- a/lib/kokkos/core/unit_test/TestCuda_c.cpp
+++ b/lib/kokkos/core/unit_test/TestCuda_c.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -60,6 +60,7 @@
 
 #include <TestViewImpl.hpp>
 #include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
 
 #include <TestViewAPI.hpp>
 #include <TestViewSubview.hpp>
@@ -85,358 +86,14 @@
 
 class cuda : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    Kokkos::Cuda::print_configuration( std::cout );
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  }
-  static void TearDownTestCase()
-  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 //----------------------------------------------------------------------------
 
 namespace Test {
-/*
-__global__
-void test_abort()
-{
-  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
-    Kokkos::CudaSpace ,
-    Kokkos::HostSpace >::verify();
-}
-
-__global__
-void test_cuda_spaces_int_value( int * ptr )
-{
-  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
-}
-
-
-TEST_F( cuda , compiler_macros )
-{
-  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
-}
-
-TEST_F( cuda , memory_space )
-{
-  TestMemorySpace< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, uvm )
-{
-  if ( Kokkos::CudaUVMSpace::available() ) {
-
-    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
-
-    *uvm_ptr = 42 ;
-
-    Kokkos::Cuda::fence();
-    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
-    Kokkos::Cuda::fence();
-
-    EXPECT_EQ( *uvm_ptr, int(2*42) );
-
-    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
-  }
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , impl_shared_alloc )
-{
-  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
-}
-
-TEST_F( cuda, policy_construction) {
-  TestRangePolicyConstruction< Kokkos::Cuda >();
-  TestTeamPolicyConstruction< Kokkos::Cuda >();
-}
-
-TEST_F( cuda , impl_view_mapping )
-{
-  test_view_mapping< Kokkos::Cuda >();
-  test_view_mapping< Kokkos::CudaUVMSpace >();
-  test_view_mapping_subview< Kokkos::Cuda >();
-  test_view_mapping_subview< Kokkos::CudaUVMSpace >();
-  test_view_mapping_operator< Kokkos::Cuda >();
-  test_view_mapping_operator< Kokkos::CudaUVMSpace >();
-  TestViewMappingAtomic< Kokkos::Cuda >::run();
-}
-
-TEST_F( cuda , view_of_class )
-{
-  TestViewMappingClassValue< Kokkos::CudaSpace >::run();
-  TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace >
-struct TestViewCudaTexture {
 
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-  using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
-
-  V m_base ;
-  T m_tex ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_tex[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaTexture()
-    : m_base("base",N)
-    , m_tex( m_base )
-    {}
-
-  static void run()
-    {
-      EXPECT_TRUE( ( std::is_same< typename V::reference_type
-                                 , double &
-                                 >::value ) );
-
-      EXPECT_TRUE( ( std::is_same< typename T::reference_type
-                                 , const double
-                                 >::value ) );
-
-      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
-      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
-
-      TestViewCudaTexture self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_texture )
-{
-  TestViewCudaTexture< Kokkos::CudaSpace >::run();
-  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
-}
-
-template< class MemSpace , class ExecSpace >
-struct TestViewCudaAccessible {
-
-  enum { N = 1000 };
-
-  using V = Kokkos::Experimental::View<double*,MemSpace> ;
-
-  V m_base ;
-
-  struct TagInit {};
-  struct TagTest {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_base[i] != i + 1 ) ++error_count ; }
-
-  TestViewCudaAccessible()
-    : m_base("base",N)
-    {}
-
-  static void run()
-    {
-      TestViewCudaAccessible self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
-      MemSpace::execution_space::fence();
-      // Next access is a different execution space, must complete prior kernel.
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
-
-
-TEST_F( cuda , impl_view_accessible )
-{
-  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
-
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( cuda, view_impl )
-{
-  // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater
-
-  test_view_impl< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_api )
-{
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
-
-  TestViewAPI< double , Kokkos::Cuda >();
-  TestViewAPI< double , Kokkos::CudaUVMSpace >();
-
-#if 0
-  Kokkos::View<double, Kokkos::Cuda > x("x");
-  Kokkos::View<double[1], Kokkos::Cuda > y("y");
-  // *x = 10 ;
-  // x() = 10 ;
-  // y[0] = 10 ;
-  // y(0) = 10 ;
-#endif
-}
-
-
-TEST_F( cuda , view_nested_view )
-{
-  ::Test::view_nested_view< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_assign_strided ) {
-  TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
-}
-
-TEST_F( cuda, view_subview_left_0 ) {
-  TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_1 ) {
-  TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_2 ) {
-  TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_left_3 ) {
-  TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_0 ) {
-  TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_1 ) {
-  TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_right_3 ) {
-  TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_1d_assign ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_3d ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
-}
-
-TEST_F( cuda, view_subview_2d_from_5d ) {
-  TestViewSubview::test_2d_subview_5d< Kokkos::CudaUVMSpace >();
-}
-
-
-TEST_F( cuda, range_tag )
-{
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  //TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
-}
-
-TEST_F( cuda, team_tag )
-{
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
-}
-
-TEST_F( cuda, reduce )
-{
-  TestReduce< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduce< double , Kokkos::Cuda >( 1000000 );
-  TestReduce< int , Kokkos::Cuda >( 0 );
-}
-
-TEST_F( cuda, reduce_team )
-{
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
-}
-
-TEST_F( cuda, shared_team )
-{
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
-}
-
-
-#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
-TEST_F( cuda, lambda_shared_team )
-{
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
-  TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
-}
-#endif
-
-
-TEST_F( cuda, reduce_dynamic )
-{
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamic< double , Kokkos::Cuda >( 1000000 );
-}
-
-TEST_F( cuda, reduce_dynamic_view )
-{
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
-  TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
-}
-*/
 TEST_F( cuda, atomic )
 {
   const int loop_count = 1e3 ;
@@ -479,6 +136,75 @@ TEST_F( cuda, atomic )
 
 }
 
+TEST_F( cuda , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 4 ) ) );
+  }
+
+}
+
 //----------------------------------------------------------------------------
 
 TEST_F( cuda, tile_layout)
@@ -512,7 +238,6 @@ TEST_F( cuda , view_aggregate )
   TestViewAggregateReduction< Kokkos::Cuda >();
 }
 
-
 TEST_F( cuda , scan )
 {
   TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
@@ -535,12 +260,19 @@ TEST_F( cuda , team_scan )
 
 TEST_F( cuda , memory_pool )
 {
-  bool val_uvm = TestMemoryPool::test_mempool< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 128000 );
-  ASSERT_TRUE( val_uvm );
+//  typedef Kokkos::CudaUVMSpace  device_type;
+  typedef Kokkos::Cuda          device_type;
+
+  bool val = TestMemoryPool::test_mempool< device_type >( 128, 128000000 );
+  ASSERT_TRUE( val );
 
   Kokkos::Cuda::fence();
 
-  TestMemoryPool::test_mempool2< Kokkos::Cuda, Kokkos::CudaUVMSpace >( 128, 2560000 );
+  TestMemoryPool::test_mempool2< device_type >( 64, 4, 100000, 200000 );
+
+  Kokkos::Cuda::fence();
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Cuda >();
 
   Kokkos::Cuda::fence();
 }
@@ -578,13 +310,43 @@ TEST_F( cuda , team_vector )
   ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
 }
 
+TEST_F( cuda, triple_nested_parallelism )
+{
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 32 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 16 , 16 );
+}
+
 }
 
 //----------------------------------------------------------------------------
 
-#if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY )
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
 
-TEST_F( cuda , task_policy )
+TEST_F( cuda , task_fib )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestFib< Kokkos::Cuda >::run(i, (i+1)*1000000 );
+  }
+}
+
+TEST_F( cuda , task_depend )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestTaskDependence< Kokkos::Cuda >::run(i);
+  }
+}
+
+TEST_F( cuda , task_team )
+{
+  //TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(1000);
+  TestTaskPolicy::TestTaskTeam< Kokkos::Cuda >::run(104);
+  TestTaskPolicy::TestTaskTeamValue< Kokkos::Cuda >::run(1000);
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , old_task_policy )
 {
   TestTaskPolicy::test_task_dep< Kokkos::Cuda >( 10 );
 
@@ -598,16 +360,16 @@ TEST_F( cuda , task_policy )
   }
 }
 
-TEST_F( cuda , task_team )
+TEST_F( cuda , old_task_team )
 {
   TestTaskPolicy::test_task_team< Kokkos::Cuda >(1000);
 }
 
-TEST_F( cuda , task_latch )
+TEST_F( cuda , old_task_latch )
 {
   TestTaskPolicy::test_latch< Kokkos::Cuda >(10);
   TestTaskPolicy::test_latch< Kokkos::Cuda >(1000);
 }
 
-#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */
+#endif // #if defined( KOKKOS_ENABLE_TASKPOLICY )
 
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
similarity index 93%
rename from lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp
rename to lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index a1e3f8fb0a..a17ed97a9f 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -257,11 +257,13 @@ protected:
   }
 };
 
-
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
 TEST_F( defaultdevicetypeinit, no_args) {
   Impl::test_no_arguments();
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
 TEST_F( defaultdevicetypeinit, commandline_args_empty) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -271,7 +273,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_empty) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
 TEST_F( defaultdevicetypeinit, commandline_args_other) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -281,7 +285,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_other) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
 TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -291,7 +297,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -301,7 +309,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -311,7 +321,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -321,7 +333,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
 TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -331,7 +345,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
 TEST_F( defaultdevicetypeinit, commandline_args_device) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -341,7 +357,9 @@ TEST_F( defaultdevicetypeinit, commandline_args_device) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
 TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
   Kokkos::InitArguments argstruct;
   int nargs = 0;
@@ -351,38 +369,49 @@ TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
     delete [] args[i];
   delete [] args;
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
 TEST_F( defaultdevicetypeinit, initstruct_default) {
   Kokkos::InitArguments args;
   Impl::test_initstruct_args(args);
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
 TEST_F( defaultdevicetypeinit, initstruct_nthreads) {
   Kokkos::InitArguments args = Impl::init_initstruct(true,false,false);
   Impl::test_initstruct_args(args);
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
 TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) {
   Kokkos::InitArguments args = Impl::init_initstruct(true,true,false);
   Impl::test_initstruct_args(args);
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
 TEST_F( defaultdevicetypeinit, initstruct_device) {
   Kokkos::InitArguments args = Impl::init_initstruct(false,false,true);
   Impl::test_initstruct_args(args);
 }
+#endif
 
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
 TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) {
   Kokkos::InitArguments args = Impl::init_initstruct(true,false,true);
   Impl::test_initstruct_args(args);
 }
+#endif
 
-
+#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
 TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) {
   Kokkos::InitArguments args = Impl::init_initstruct(true,true,true);
   Impl::test_initstruct_args(args);
 }
-
+#endif
 
 
 } // namespace test
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp
new file mode 100644
index 0000000000..40a773b3b8
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_1.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp
new file mode 100644
index 0000000000..f12c4f62b2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_10.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp
new file mode 100644
index 0000000000..c7ffd7b94e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_11.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp
new file mode 100644
index 0000000000..24e2b15201
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_12.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp
new file mode 100644
index 0000000000..7968c13b66
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_13.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp
new file mode 100644
index 0000000000..ab0563c6dc
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_14.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp
new file mode 100644
index 0000000000..70a8ca1727
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_15.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp
new file mode 100644
index 0000000000..727c7a95eb
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_16.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp
new file mode 100644
index 0000000000..88fba34c50
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_2.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp
new file mode 100644
index 0000000000..b3562cc53d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_3.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp
new file mode 100644
index 0000000000..0d4983319c
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_4.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp
new file mode 100644
index 0000000000..026fb01f88
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_5.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp
new file mode 100644
index 0000000000..937a13160e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_6.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp
new file mode 100644
index 0000000000..992c854c1a
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_7.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp
new file mode 100644
index 0000000000..07a8b1cb7c
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_8.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp
new file mode 100644
index 0000000000..4d8c05be2d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit_9.cpp
@@ -0,0 +1,2 @@
+#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
+#include<TestDefaultDeviceTypeInit.hpp>
diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
similarity index 77%
rename from lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
rename to lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
index 70f0545b2c..c15f812233 100644
--- a/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-//
+// 
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-//
+// 
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-//
+// 
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,9 +36,41 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
+// 
 // ************************************************************************
 //@HEADER
 */
 
-#include<impl/Kokkos_MemoryPool_Inline.hpp>
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+#include <TestReduce.hpp>
+
+
+namespace Test {
+
+class defaultdevicetype : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    Kokkos::initialize();
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::finalize();
+  }
+};
+
+
+TEST_F( defaultdevicetype, reduce_instantiation) {
+  TestReduceCombinatoricalInstantiation<>::execute();
+}
+
+} // namespace test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
new file mode 100644
index 0000000000..9894d1ce69
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -0,0 +1,555 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+
+template <typename ExecSpace >
+struct TestMDRange_2D {
+
+  using DataType     = int ;
+  using ViewType     = typename Kokkos::View< DataType** ,  ExecSpace > ;
+  using HostViewType = typename ViewType::HostMirror ;
+
+  ViewType input_view ;
+
+  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view("input_view", N0, N1) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , const int j ) const
+  {
+    input_view(i,j) = 1;
+  }
+
+
+  static void test_for2( const int64_t N0, const int64_t N1 )
+  {
+
+    using namespace Kokkos::Experimental;
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> >;
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Left >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1}, {3,3} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Right >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1}, {7,7} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1}, {16,16} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0}, {N0,N1}, {5,16} );
+      TestMDRange_2D functor(N0,N1);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          if ( h_view(i,j) != 1 ) {
+            ++counter;
+          }
+        }}
+      if ( counter != 0 )
+        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+  } //end test_for2
+}; //MDRange_2D
+
+template <typename ExecSpace >
+struct TestMDRange_3D {
+
+  using DataType = int ;
+  using ViewType     = typename Kokkos::View< DataType*** ,  ExecSpace > ;
+  using HostViewType = typename ViewType::HostMirror ;
+
+  ViewType input_view ;
+
+  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view("input_view", N0, N1, N2) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , const int j , const int k ) const
+  {
+    input_view(i,j,k) = 1;
+  }
+
+  static void test_for3( const int64_t N0, const int64_t N1, const int64_t N2 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Default>, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2}, {3,5,7} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2}, {8,8,8} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+    {
+      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+
+      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
+      TestMDRange_3D functor(N0,N1,N2);
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view , functor.input_view );
+
+      int counter = 0;
+      for ( int i=0; i<N0; ++i ) {
+        for ( int j=0; j<N1; ++j ) {
+          for ( int k=0; k<N2; ++k ) {
+          if ( h_view(i,j,k) != 1 ) {
+            ++counter;
+          }
+        }}}
+      if ( counter != 0 )
+        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
+      ASSERT_EQ( counter , 0 );
+    }
+
+  } //end test_for3
+};
+
+} /* namespace */
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
index b49d91e6a8..cf650b0bc8 100644
--- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -48,37 +48,45 @@
 #include <stdio.h>
 #include <iostream>
 #include <cmath>
+#include <algorithm>
 
 #include <impl/Kokkos_Timer.hpp>
 
 //#define TESTMEMORYPOOL_PRINT
 //#define TESTMEMORYPOOL_PRINT_STATUS
 
+#ifdef KOKKOS_HAVE_CUDA
+#define STRIDE 32
+#else
+#define STRIDE 1
+#endif
+
 namespace TestMemoryPool {
 
 struct pointer_obj {
-  uint64_t * ptr;
+  uint64_t *  ptr;
+};
+
+struct pointer_obj2 {
+  void *  ptr;
+  size_t  size;
 };
 
-template < typename PointerView, typename MemorySpace >
+template < typename PointerView, typename Allocator >
 struct allocate_memory {
   typedef typename PointerView::execution_space  execution_space;
   typedef typename execution_space::size_type    size_type;
 
-  enum { STRIDE = 32 };
+  PointerView  m_pointers;
+  size_t       m_chunk_size;
+  Allocator    m_mempool;
 
-  PointerView m_pointers;
-  size_t m_num_ptrs;
-  size_t m_chunk_size;
-  MemorySpace m_space;
-
-  allocate_memory( PointerView & ptrs, size_t nptrs,
-                   size_t cs, MemorySpace & sp )
-    : m_pointers( ptrs ), m_num_ptrs( nptrs ),
-      m_chunk_size( cs ), m_space( sp )
+  allocate_memory( PointerView & ptrs, size_t num_ptrs,
+                   size_t cs, Allocator & m )
+    : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m )
   {
     // Initialize the view with the out degree of each vertex.
-    Kokkos::parallel_for( m_num_ptrs * STRIDE , *this );
+    Kokkos::parallel_for( num_ptrs * STRIDE, *this );
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -86,26 +94,55 @@ struct allocate_memory {
   {
     if ( i % STRIDE == 0 ) {
       m_pointers[i / STRIDE].ptr =
-        static_cast< uint64_t * >( m_space.allocate( m_chunk_size ) );
+        static_cast< uint64_t * >( m_mempool.allocate( m_chunk_size ) );
     }
   }
 };
 
 template < typename PointerView >
-struct fill_memory {
+struct count_invalid_memory {
   typedef typename PointerView::execution_space  execution_space;
   typedef typename execution_space::size_type    size_type;
+  typedef uint64_t                               value_type;
+
+  PointerView  m_pointers;
+  uint64_t &   m_result;
+
+  count_invalid_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res )
+    : m_pointers( ptrs ), m_result( res )
+  {
+    // Launch a parallel_reduce counting null (failed) allocations.
+    Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, volatile value_type const & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & r ) const
+  {
+    if ( i % STRIDE == 0 ) {
+      r += ( m_pointers[i / STRIDE].ptr == 0 );
+    }
+  }
+};
 
-  enum { STRIDE = 32 };
+template < typename PointerView >
+struct fill_memory {
+  typedef typename PointerView::execution_space  execution_space;
+  typedef typename execution_space::size_type    size_type;
 
   PointerView m_pointers;
-  size_t m_num_ptrs;
 
-  fill_memory( PointerView & ptrs, size_t nptrs )
-    : m_pointers( ptrs ), m_num_ptrs( nptrs )
+  fill_memory( PointerView & ptrs, size_t num_ptrs ) : m_pointers( ptrs )
   {
     // Initialize the view with the out degree of each vertex.
-    Kokkos::parallel_for( m_num_ptrs * STRIDE , *this );
+    Kokkos::parallel_for( num_ptrs * STRIDE, *this );
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -123,17 +160,14 @@ struct sum_memory {
   typedef typename execution_space::size_type    size_type;
   typedef uint64_t                               value_type;
 
-  enum { STRIDE = 32 };
+  PointerView  m_pointers;
+  uint64_t &   m_result;
 
-  PointerView m_pointers;
-  size_t m_num_ptrs;
-  uint64_t & result;
-
-  sum_memory( PointerView & ptrs, size_t nptrs, uint64_t & res )
-    : m_pointers( ptrs ), m_num_ptrs( nptrs ), result( res )
+  sum_memory( PointerView & ptrs, size_t num_ptrs, uint64_t & res )
+    : m_pointers( ptrs ), m_result( res )
   {
     // Initialize the view with the out degree of each vertex.
-    Kokkos::parallel_reduce( m_num_ptrs * STRIDE , *this, result );
+    Kokkos::parallel_reduce( num_ptrs * STRIDE, *this, m_result );
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -153,65 +187,70 @@ struct sum_memory {
   }
 };
 
-template < typename PointerView, typename MemorySpace >
+template < typename PointerView, typename Allocator >
 struct deallocate_memory {
   typedef typename PointerView::execution_space  execution_space;
   typedef typename execution_space::size_type    size_type;
 
-  enum { STRIDE = 32 };
+  PointerView  m_pointers;
+  size_t       m_chunk_size;
+  Allocator    m_mempool;
 
-  PointerView m_pointers;
-  size_t m_num_ptrs;
-  size_t m_chunk_size;
-  MemorySpace m_space;
-
-  deallocate_memory( PointerView & ptrs, size_t nptrs,
-                     size_t cs, MemorySpace & sp )
-    : m_pointers( ptrs ), m_num_ptrs( nptrs ), m_chunk_size( cs ), m_space( sp )
+  deallocate_memory( PointerView & ptrs, size_t num_ptrs,
+                     size_t cs, Allocator & m )
+    : m_pointers( ptrs ), m_chunk_size( cs ), m_mempool( m )
   {
     // Initialize the view with the out degree of each vertex.
-    Kokkos::parallel_for( m_num_ptrs * STRIDE , *this );
+    Kokkos::parallel_for( num_ptrs * STRIDE, *this );
   }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( size_type i ) const
   {
     if ( i % STRIDE == 0 ) {
-      m_space.deallocate( m_pointers[i / STRIDE].ptr, m_chunk_size );
+      m_mempool.deallocate( m_pointers[i / STRIDE].ptr, m_chunk_size );
     }
   }
 };
 
-template < typename ExecutionSpace, typename MemorySpace >
+template < typename WorkView, typename PointerView, typename ScalarView,
+           typename Allocator >
 struct allocate_deallocate_memory {
-  typedef ExecutionSpace                       execution_space;
+  typedef typename WorkView::execution_space   execution_space;
   typedef typename execution_space::size_type  size_type;
 
-  enum { STRIDE = 32 };
-
-  size_t m_num_max_chunks;
-  size_t m_max_chunk_size;
-  size_t m_min_chunk_size;
-  size_t m_chunk_spacing;
-  MemorySpace m_space;
+  WorkView     m_work;
+  PointerView  m_pointers;
+  ScalarView   m_ptrs_front;
+  ScalarView   m_ptrs_back;
+  Allocator    m_mempool;
 
-  allocate_deallocate_memory( size_t nmc, size_t max_cs,
-                              size_t min_cs, size_t cs, MemorySpace & sp )
-    : m_num_max_chunks( nmc ), m_max_chunk_size( max_cs ),
-      m_min_chunk_size( min_cs ), m_chunk_spacing( cs ), m_space( sp )
+  allocate_deallocate_memory( WorkView & w, size_t work_size, PointerView & p,
+                              ScalarView pf, ScalarView pb, Allocator & m )
+    : m_work( w ), m_pointers( p ), m_ptrs_front( pf ), m_ptrs_back( pb ),
+      m_mempool( m )
   {
-    Kokkos::parallel_for( m_num_max_chunks * STRIDE, *this );
+    // Launch the interleaved allocate/deallocate work across all entries.
+    Kokkos::parallel_for( work_size * STRIDE, *this );
   }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( size_type i ) const
   {
     if ( i % STRIDE == 0 ) {
-      for ( size_t j = m_max_chunk_size; j >= m_min_chunk_size; j /= m_chunk_spacing ) {
-        for ( size_t k = 0; k < 10; ++k ) {
-          void * mem = m_space.allocate( j );
-          m_space.deallocate( mem, j );
-        }
+      unsigned my_work = m_work[i / STRIDE];
+
+      if ( ( my_work & 1 ) == 0 ) {
+        // Allocation.
+        size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_back(), 1 );
+        size_t alloc_size = my_work >> 1;
+        m_pointers[pos].ptr = m_mempool.allocate( alloc_size );
+        m_pointers[pos].size = alloc_size;
+      }
+      else {
+        // Deallocation.
+        size_t pos = Kokkos::atomic_fetch_add( &m_ptrs_front(), 1 );
+        m_mempool.deallocate( m_pointers[pos].ptr, m_pointers[pos].size );
       }
     }
   }
@@ -255,12 +294,14 @@ void print_results( const std::string & text, unsigned long long width,
 // pool and breaking large chunks into smaller chunks to fulfill allocation
 // requests.  It verifies that MemoryPool(), allocate(), and deallocate() work
 // correctly.
-template < class ExecSpace, class MemorySpace = typename ExecSpace::memory_space >
+template < class Device >
 bool test_mempool( size_t chunk_size, size_t total_size )
 {
-  typedef Kokkos::View< pointer_obj *, ExecSpace >       pointer_view;
-  typedef Kokkos::Experimental::MemoryPool< MemorySpace , ExecSpace >
-   pool_memory_space;
+  typedef typename Device::execution_space                 execution_space;
+  typedef typename Device::memory_space                    memory_space;
+  typedef Device                                           device_type;
+  typedef Kokkos::View< pointer_obj *, device_type >       pointer_view;
+  typedef Kokkos::Experimental::MemoryPool< device_type >  pool_memory_space;
 
   uint64_t result;
   size_t num_chunks = total_size / chunk_size;
@@ -269,7 +310,8 @@ bool test_mempool( size_t chunk_size, size_t total_size )
   pointer_view pointers( "pointers", num_chunks );
 
 #ifdef TESTMEMORYPOOL_PRINT
-  std::cout << std::setw( SHIFTW ) << "chunk_size: " << std::setw( 12 )
+  std::cout << "*** test_mempool() ***" << std::endl
+            << std::setw( SHIFTW ) << "chunk_size: " << std::setw( 12 )
             << chunk_size << std::endl
             << std::setw( SHIFTW ) << "total_size: " << std::setw( 12 )
             << total_size << std::endl
@@ -277,46 +319,53 @@ bool test_mempool( size_t chunk_size, size_t total_size )
             << num_chunks << std::endl;
 
   double elapsed_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 #endif
 
-  pool_memory_space m_space( MemorySpace(), chunk_size, total_size );
+  pool_memory_space mempool( memory_space(), total_size * 1.2, 20 );
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "initialize mempool: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
   timer.reset();
 #endif
 
-  // Tests:
-  //   test for correct behvior when out of memory
-  //   test for correct behvior when interleaving allocate() and deallocate()
-
   {
     allocate_memory< pointer_view, pool_memory_space >
-      am( pointers, num_chunks, chunk_size, m_space );
+      am( pointers, num_chunks, chunk_size, mempool );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "allocate chunks: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
   timer.reset();
 #endif
 
+  {
+    count_invalid_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "invalid chunks: ", 16, elapsed_time, result );
+  timer.reset();
+#endif
+
   {
     fill_memory< pointer_view > fm( pointers, num_chunks );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "fill chunks: ", elapsed_time );
   timer.reset();
@@ -326,10 +375,11 @@ bool test_mempool( size_t chunk_size, size_t total_size )
     sum_memory< pointer_view > sm( pointers, num_chunks, result );
   }
 
+  execution_space::fence();
+
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
   elapsed_time = timer.seconds();
-  print_results( "sum chunks: ", 10, elapsed_time, result );
+  print_results( "sum chunks: ", 16, elapsed_time, result );
 #endif
 
   if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) {
@@ -343,40 +393,51 @@ bool test_mempool( size_t chunk_size, size_t total_size )
 
   {
     deallocate_memory< pointer_view, pool_memory_space >
-      dm( pointers, num_chunks, chunk_size, m_space );
+      dm( pointers, num_chunks, chunk_size, mempool );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "deallocate chunks: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
   timer.reset();
 #endif
 
   {
     allocate_memory< pointer_view, pool_memory_space >
-      am( pointers, num_chunks, chunk_size, m_space );
+      am( pointers, num_chunks, chunk_size, mempool );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "allocate chunks: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
   timer.reset();
 #endif
 
+  {
+    count_invalid_memory< pointer_view > sm( pointers, num_chunks, result );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "invalid chunks: ", 16, elapsed_time, result );
+  timer.reset();
+#endif
+
   {
     fill_memory< pointer_view > fm( pointers, num_chunks );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "fill chunks: ", elapsed_time );
   timer.reset();
@@ -386,10 +447,11 @@ bool test_mempool( size_t chunk_size, size_t total_size )
     sum_memory< pointer_view > sm( pointers, num_chunks, result );
   }
 
+  execution_space::fence();
+
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
   elapsed_time = timer.seconds();
-  print_results( "sum chunks: ", 10, elapsed_time, result );
+  print_results( "sum chunks: ", 16, elapsed_time, result );
 #endif
 
   if ( result != ( num_chunks * ( num_chunks - 1 ) ) / 2 ) {
@@ -403,78 +465,340 @@ bool test_mempool( size_t chunk_size, size_t total_size )
 
   {
     deallocate_memory< pointer_view, pool_memory_space >
-      dm( pointers, num_chunks, chunk_size, m_space );
+      dm( pointers, num_chunks, chunk_size, mempool );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "deallocate chunks: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
 #endif
 
   return return_val;
 }
 
+template < typename T >
+T smallest_power2_ge( T val )
+{
+  // Find the most significant nonzero bit.
+  int first_nonzero_bit = Kokkos::Impl::bit_scan_reverse( val );
+
+  // If val is an integral power of 2, ceil( log2(val) ) is equal to the
+  // most significant nonzero bit.  Otherwise, you need to add 1.
+  int lg2_size = first_nonzero_bit +
+                 !Kokkos::Impl::is_integral_power_of_two( val );
+
+  return T(1) << T(lg2_size);
+}
+
 // This test makes allocation requests for multiple sizes and interleaves
 // allocation and deallocation.
-template < class ExecSpace, class MemorySpace = typename ExecSpace::memory_space >
-void test_mempool2( size_t chunk_size, size_t total_size )
+//
+// There are 3 phases.  The first phase does only allocations to build up a
+// working state for the allocator.  The second phase interleaves allocations
+// and deletions.  The third phase does only deallocations to undo all the
+// allocations from the first phase.  By building first to a working state,
+// allocations and deallocations can happen in any order for the second phase.
+// Each phase performs on multiple chunk sizes.
+template < class Device >
+void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
+                    size_t phase1_size, size_t phase2_size )
 {
-  typedef Kokkos::Experimental::MemoryPool< MemorySpace , ExecSpace >
-    pool_memory_space;
+#ifdef TESTMEMORYPOOL_PRINT
+  typedef typename Device::execution_space                 execution_space;
+#endif
+  typedef typename Device::memory_space                    memory_space;
+  typedef Device                                           device_type;
+  typedef Kokkos::View< unsigned *, device_type >          work_view;
+  typedef Kokkos::View< size_t, device_type >              scalar_view;
+  typedef Kokkos::View< pointer_obj2 *, device_type >      pointer_view;
+  typedef Kokkos::Experimental::MemoryPool< device_type >  pool_memory_space;
+
+  enum {
+    MIN_CHUNK_SIZE      = 64,
+    MIN_BASE_CHUNK_SIZE = MIN_CHUNK_SIZE / 2 + 1
+  };
+
+  // Make sure the base chunk size is at least MIN_BASE_CHUNK_SIZE bytes, so
+  // all the different chunk sizes translate to different block sizes for the
+  // allocator.
+  if ( base_chunk_size < MIN_BASE_CHUNK_SIZE ) {
+    base_chunk_size = MIN_BASE_CHUNK_SIZE;
+  }
+
+  // Get the smallest power of 2 >= the base chunk size.  The size must be
+  // >= MIN_CHUNK_SIZE, though.
+  unsigned ceil_base_chunk_size = smallest_power2_ge( base_chunk_size );
+  if ( ceil_base_chunk_size < MIN_CHUNK_SIZE ) {
+    ceil_base_chunk_size = MIN_CHUNK_SIZE;
+  }
+
+  // Make sure the phase 1 size is multiples of num_chunk_sizes.
+  phase1_size = ( ( phase1_size + num_chunk_sizes - 1 ) / num_chunk_sizes ) *
+                num_chunk_sizes;
+
+  // Make sure the phase 2 size is multiples of (2 * num_chunk_sizes).
+  phase2_size =
+    ( ( phase2_size + 2 * num_chunk_sizes - 1 ) / ( 2 * num_chunk_sizes ) ) *
+    2 * num_chunk_sizes;
+
+  // The phase2 size must be <= twice the phase1 size so that deallocations
+  // can't happen before allocations.
+  if ( phase2_size > 2 * phase1_size ) phase2_size = 2 * phase1_size;
+
+  size_t phase3_size = phase1_size;
+  size_t half_phase2_size = phase2_size / 2;
+
+  // Each entry in the work views has the following format.  The least
+  // significant bit indicates allocation (0) vs. deallocation (1).  For
+  // allocation, the other bits indicate the desired allocation size.
+
+  // Initialize the phase 1 work view with an equal number of allocations for
+  // each chunk size.
+  work_view phase1_work( "Phase 1 Work", phase1_size );
+  typename work_view::HostMirror host_phase1_work =
+    create_mirror_view(phase1_work);
+
+  size_t inner_size = phase1_size / num_chunk_sizes;
+  unsigned chunk_size = base_chunk_size;
+
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    for ( size_t j = 0; j < inner_size; ++j ) {
+      host_phase1_work[i * inner_size + j] = chunk_size << 1;
+    }
+
+    chunk_size *= 2;
+  }
+
+  std::random_shuffle( host_phase1_work.ptr_on_device(),
+                       host_phase1_work.ptr_on_device() + phase1_size );
+
+  deep_copy( phase1_work, host_phase1_work );
+
+  // Initialize the phase 2 work view with half allocations and half
+  // deallocations with an equal number of allocations for each chunk size.
+  work_view phase2_work( "Phase 2 Work", phase2_size );
+  typename work_view::HostMirror host_phase2_work =
+    create_mirror_view(phase2_work);
 
-  size_t num_chunk_sizes = 4;
-  size_t chunk_spacing = 4;
+  inner_size = half_phase2_size / num_chunk_sizes;
+  chunk_size = base_chunk_size;
+
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    for ( size_t j = 0; j < inner_size; ++j ) {
+      host_phase2_work[i * inner_size + j] = chunk_size << 1;
+    }
+
+    chunk_size *= 2;
+  }
+
+  for ( size_t i = half_phase2_size; i < phase2_size; ++i ) {
+    host_phase2_work[i] = 1;
+  }
+
+  std::random_shuffle( host_phase2_work.ptr_on_device(),
+                       host_phase2_work.ptr_on_device() + phase2_size );
+
+  deep_copy( phase2_work, host_phase2_work );
+
+  // Initialize the phase 3 work view with all deallocations.
+  work_view phase3_work( "Phase 3 Work", phase3_size );
+  typename work_view::HostMirror host_phase3_work =
+    create_mirror_view(phase3_work);
+
+  inner_size = phase3_size / num_chunk_sizes;
+
+  for ( size_t i = 0; i < phase3_size; ++i ) host_phase3_work[i] = 1;
+
+  deep_copy( phase3_work, host_phase3_work );
+
+  // Calculate the amount of memory needed for the allocator.  We need to know
+  // the number of superblocks required for each chunk size and use that to
+  // calculate the amount of memory for each chunk size.
+  size_t lg_sb_size = 18;
+  size_t sb_size = 1 << lg_sb_size;
+  size_t total_size = 0;
+  size_t allocs_per_size = phase1_size / num_chunk_sizes +
+                           half_phase2_size / num_chunk_sizes;
+
+  chunk_size = ceil_base_chunk_size;
+  for ( size_t i = 0; i < num_chunk_sizes; ++i ) {
+    size_t my_size = allocs_per_size * chunk_size;
+    total_size += ( my_size + sb_size - 1 ) / sb_size * sb_size;
+    chunk_size *= 2;
+  }
+
+  // Declare the queue to hold the records for allocated memory.  An allocation
+  // adds a record to the back of the queue, and a deallocation removes a
+  // record from the front of the queue.
+  size_t num_allocations = phase1_size + half_phase2_size;
+  scalar_view ptrs_front( "Pointers front" );
+  scalar_view ptrs_back( "Pointers back" );
+
+  pointer_view pointers( "pointers", num_allocations );
 
 #ifdef TESTMEMORYPOOL_PRINT
+  printf( "\n*** test_mempool2() ***\n" );
+  printf( "       num_chunk_sizes: %12zu\n", num_chunk_sizes );
+  printf( "       base_chunk_size: %12u\n", base_chunk_size );
+  printf( "  ceil_base_chunk_size: %12u\n", ceil_base_chunk_size );
+  printf( "           phase1_size: %12zu\n", phase1_size );
+  printf( "           phase2_size: %12zu\n", phase2_size );
+  printf( "           phase3_size: %12zu\n", phase3_size );
+  printf( "       allocs_per_size: %12zu\n", allocs_per_size );
+  printf( "       num_allocations: %12zu\n", num_allocations );
+  printf( "            total_size: %12zu\n", total_size );
+  fflush( stdout );
+
   double elapsed_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 #endif
 
-  pool_memory_space m_space( MemorySpace(), chunk_size, total_size,
-                             num_chunk_sizes, chunk_spacing );
+  pool_memory_space mempool( memory_space(), total_size * 1.2, lg_sb_size );
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
   print_results( "initialize mempool: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+
+  timer.reset();
+#endif
+
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase1_work, phase1_size, pointers, ptrs_front, ptrs_back, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase1: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+
+  timer.reset();
+#endif
+
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase2_work, phase2_size, pointers, ptrs_front, ptrs_back, mempool );
+  }
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase2: ", elapsed_time );
+
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
 #endif
 
-  chunk_size = m_space.get_min_chunk_size();
-  total_size = m_space.get_mem_size();
+  timer.reset();
+#endif
 
-  // Get the chunk size for the largest possible chunk.
-  //   max_chunk_size =
-  //     chunk_size * (MEMPOOL_CHUNK_SPACING ^ (MEMPOOL_NUM_CHUNK_SIZES - 1))
-  size_t max_chunk_size = chunk_size;
-  for (size_t i = 1; i < num_chunk_sizes; ++i) {
-    max_chunk_size *= chunk_spacing;
+  {
+    allocate_deallocate_memory< work_view, pointer_view, scalar_view,
+                                pool_memory_space >
+      adm( phase3_work, phase3_size, pointers, ptrs_front, ptrs_back, mempool );
   }
 
-  size_t num_max_chunks = total_size / ( max_chunk_size * num_chunk_sizes );
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "phase3: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
+#endif
+}
+
+// Tests for correct behavior when the allocator is out of memory.
+template < class Device >
+void test_memory_exhaustion()
+{
+#ifdef TESTMEMORYPOOL_PRINT
+  typedef typename Device::execution_space                 execution_space;
+#endif
+  typedef typename Device::memory_space                    memory_space;
+  typedef Device                                           device_type;
+  typedef Kokkos::View< pointer_obj *, device_type >       pointer_view;
+  typedef Kokkos::Experimental::MemoryPool< device_type >  pool_memory_space;
+
+  // The allocator will have a single superblock, and allocations will all be
+  // of the same chunk size.  The allocation loop will attempt to allocate
+  // twice the number of chunks as are available in the allocator.  The
+  // deallocation loop will only free the successfully allocated chunks.
+
+  size_t chunk_size = 128;
+  size_t num_chunks = 128;
+  size_t half_num_chunks = num_chunks / 2;
+  size_t superblock_size = chunk_size * half_num_chunks;
+  size_t lg_superblock_size =
+    Kokkos::Impl::integral_power_of_two( superblock_size );
 
+  pointer_view pointers( "pointers", num_chunks );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  std::cout << "\n*** test_memory_exhaustion() ***" << std::endl;
+
+  double elapsed_time = 0;
+  Kokkos::Timer timer;
+#endif
+
+  pool_memory_space mempool( memory_space(), superblock_size,
+                             lg_superblock_size );
+
+#ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "initialize mempool: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
+#endif
+  timer.reset();
 #endif
 
+  {
+    allocate_memory< pointer_view, pool_memory_space >
+      am( pointers, num_chunks, chunk_size, mempool );
+  }
+
 #ifdef TESTMEMORYPOOL_PRINT
+  execution_space::fence();
+  elapsed_time = timer.seconds();
+  print_results( "allocate chunks: ", elapsed_time );
+#ifdef TESTMEMORYPOOL_PRINT_STATUS
+  mempool.print_status();
+#endif
   timer.reset();
 #endif
 
   {
-    allocate_deallocate_memory< ExecSpace, pool_memory_space >
-      am( num_max_chunks, max_chunk_size, chunk_size, chunk_spacing, m_space );
+    // In parallel, the allocations that succeeded were not put contiguously
+    // into the pointers View.  The whole View can still be looped over and
+    // have deallocate called because deallocate will just do nothing for NULL
+    // pointers.
+    deallocate_memory< pointer_view, pool_memory_space >
+      dm( pointers, num_chunks, chunk_size, mempool );
   }
 
 #ifdef TESTMEMORYPOOL_PRINT
-  ExecSpace::fence();
+  execution_space::fence();
   elapsed_time = timer.seconds();
-  print_results( "allocate / deallocate: ", elapsed_time );
+  print_results( "deallocate chunks: ", elapsed_time );
 #ifdef TESTMEMORYPOOL_PRINT_STATUS
-  m_space.print_status();
+  mempool.print_status();
 #endif
 #endif
 }
@@ -489,4 +813,8 @@ void test_mempool2( size_t chunk_size, size_t total_size )
 #undef TESTMEMORYPOOL_PRINT_STATUS
 #endif
 
+#ifdef STRIDE
+#undef STRIDE
+#endif
+
 #endif
diff --git a/lib/kokkos/core/unit_test/TestOpenMP.cpp b/lib/kokkos/core/unit_test/TestOpenMP.cpp
index 35bc7c9869..6e8fc45179 100644
--- a/lib/kokkos/core/unit_test/TestOpenMP.cpp
+++ b/lib/kokkos/core/unit_test/TestOpenMP.cpp
@@ -55,6 +55,7 @@
 
 #include <TestViewImpl.hpp>
 #include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
 
 #include <TestViewAPI.hpp>
 #include <TestViewSubview.hpp>
@@ -81,6 +82,7 @@
 
 #include <TestPolicyConstruction.hpp>
 
+#include <TestMDRange.hpp>
 
 namespace Test {
 
@@ -97,6 +99,7 @@ protected:
 
     Kokkos::OpenMP::initialize( threads_count );
     Kokkos::OpenMP::print_configuration( std::cout , true );
+    srand(10231);
   }
 
   static void TearDownTestCase()
@@ -110,6 +113,12 @@ protected:
 };
 
 
+TEST_F( openmp , md_range ) {
+  TestMDRange_2D< Kokkos::OpenMP >::test_for2(100,100);
+
+  TestMDRange_3D< Kokkos::OpenMP >::test_for3(100,100,100);
+}
+
 TEST_F( openmp , impl_shared_alloc ) {
   test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >();
 }
@@ -180,5 +189,74 @@ TEST_F( openmp , atomics )
   ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,3) ) );
 }
 
+TEST_F( openmp , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+  }
+
+}
+
 } // namespace test
 
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_a.cpp b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp
index 919eea7c80..64eac66804 100644
--- a/lib/kokkos/core/unit_test/TestOpenMP_a.cpp
+++ b/lib/kokkos/core/unit_test/TestOpenMP_a.cpp
@@ -86,27 +86,8 @@ namespace Test {
 
 class openmp : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-    const unsigned threads_count = std::max( 1u , numa_count ) *
-                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
-
-    Kokkos::OpenMP::initialize( threads_count );
-    Kokkos::OpenMP::print_configuration( std::cout , true );
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::OpenMP::finalize();
-
-    omp_set_num_threads(1);
-
-    ASSERT_EQ( 1 , omp_get_max_threads() );
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 TEST_F( openmp, view_subview_auto_1d_left ) {
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_b.cpp b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp
index f024e22422..6cc2476014 100644
--- a/lib/kokkos/core/unit_test/TestOpenMP_b.cpp
+++ b/lib/kokkos/core/unit_test/TestOpenMP_b.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -86,27 +86,8 @@ namespace Test {
 
 class openmp : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-    const unsigned threads_count = std::max( 1u , numa_count ) *
-                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
-
-    Kokkos::OpenMP::initialize( threads_count );
-    Kokkos::OpenMP::print_configuration( std::cout , true );
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::OpenMP::finalize();
-
-    omp_set_num_threads(1);
-
-    ASSERT_EQ( 1 , omp_get_max_threads() );
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 TEST_F( openmp , range_tag )
@@ -122,6 +103,10 @@ TEST_F( openmp , range_tag )
 
 TEST_F( openmp , team_tag )
 {
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
+  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
   TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
   TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
   TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
@@ -148,6 +133,14 @@ TEST_F( openmp, long_reduce_dynamic_view ) {
   TestReduceDynamicView< long ,   Kokkos::OpenMP >( 1000000 );
 }
 
+TEST_F( openmp , reducers )
+{
+  TestReducers<int, Kokkos::OpenMP>::execute_integer();
+  TestReducers<size_t, Kokkos::OpenMP>::execute_integer();
+  TestReducers<double, Kokkos::OpenMP>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::OpenMP>::execute_basic();
+}
+
 TEST_F( openmp, team_long_reduce) {
   TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
   TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
@@ -172,12 +165,21 @@ TEST_F( openmp, team_scratch_request) {
   TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) 
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
 TEST_F( openmp, team_lambda_shared_request) {
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
+TEST_F( openmp, shmem_size) {
+  TestShmemSize< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, multi_level_scratch) {
+  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+}
+
 } // namespace test
 
diff --git a/lib/kokkos/core/unit_test/TestOpenMP_c.cpp b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp
index d9ed87878a..f0cdabe913 100644
--- a/lib/kokkos/core/unit_test/TestOpenMP_c.cpp
+++ b/lib/kokkos/core/unit_test/TestOpenMP_c.cpp
@@ -71,6 +71,7 @@
 #include <TestAggregateReduction.hpp>
 #include <TestCompilerMacros.hpp>
 #include <TestMemoryPool.hpp>
+#include <TestTaskPolicy.hpp>
 
 
 #include <TestCXX11.hpp>
@@ -86,27 +87,8 @@ namespace Test {
 
 class openmp : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-    const unsigned threads_count = std::max( 1u , numa_count ) *
-                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
-
-    Kokkos::OpenMP::initialize( threads_count );
-    Kokkos::OpenMP::print_configuration( std::cout , true );
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::OpenMP::finalize();
-
-    omp_set_num_threads(1);
-
-    ASSERT_EQ( 1 , omp_get_max_threads() );
-  }
+  static void SetUpTestCase();
+  static void TearDownTestCase();
 };
 
 TEST_F( openmp , view_remap )
@@ -197,7 +179,9 @@ TEST_F( openmp , memory_pool )
   bool val = TestMemoryPool::test_mempool< Kokkos::OpenMP >( 128, 128000000 );
   ASSERT_TRUE( val );
 
-  TestMemoryPool::test_mempool2< Kokkos::OpenMP >( 128, 128000000 );
+  TestMemoryPool::test_mempool2< Kokkos::OpenMP >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::OpenMP >();
 }
 
 //----------------------------------------------------------------------------
@@ -240,5 +224,39 @@ TEST_F( openmp , team_vector )
   ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) );
   ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) );
 }
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+TEST_F( openmp , task_fib )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestFib< Kokkos::OpenMP >::run(i, (i+1)*1000000 );
+  }
+}
+
+TEST_F( openmp , task_depend )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestTaskDependence< Kokkos::OpenMP >::run(i);
+  }
+}
+
+TEST_F( openmp , task_team )
+{
+  TestTaskPolicy::TestTaskTeam< Kokkos::OpenMP >::run(1000);
+  //TestTaskPolicy::TestTaskTeamValue< Kokkos::OpenMP >::run(1000); //TODO put back after testing
+}
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
 } // namespace test
 
+
+
+
+
+
diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 5aac8332fc..049138eb07 100644
--- a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -421,68 +421,68 @@ private:
     ASSERT_EQ  (p1.league_size() , league_size);
     ASSERT_EQ  (p1.team_size()   , team_size);
     ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(), 0);
+    ASSERT_EQ  (p1.scratch_size(0), 0);
 
     policy_t p2 = p1.set_chunk_size(chunk_size);
     ASSERT_EQ  (p1.league_size() , league_size);
     ASSERT_EQ  (p1.team_size()   , team_size);
     ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(), 0);
+    ASSERT_EQ  (p1.scratch_size(0), 0);
 
     ASSERT_EQ  (p2.league_size() , league_size);
     ASSERT_EQ  (p2.team_size()   , team_size);
     ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(), 0);
+    ASSERT_EQ  (p2.scratch_size(0), 0);
 
     policy_t p3 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
     ASSERT_EQ  (p2.league_size() , league_size);
     ASSERT_EQ  (p2.team_size()   , team_size);
     ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(), 0);
+    ASSERT_EQ  (p2.scratch_size(0), 0);
     ASSERT_EQ  (p3.league_size() , league_size);
     ASSERT_EQ  (p3.team_size()   , team_size);
     ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(), per_team_scratch);
+    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
 
     policy_t p4 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch));
     ASSERT_EQ  (p2.league_size() , league_size);
     ASSERT_EQ  (p2.team_size()   , team_size);
     ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(), 0);
+    ASSERT_EQ  (p2.scratch_size(0), 0);
     ASSERT_EQ  (p4.league_size() , league_size);
     ASSERT_EQ  (p4.team_size()   , team_size);
     ASSERT_EQ  (p4.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p4.scratch_size(), per_thread_scratch*team_size);
+    ASSERT_EQ  (p4.scratch_size(0), per_thread_scratch*team_size);
 
     policy_t p5 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch),Kokkos::PerTeam(per_team_scratch));
     ASSERT_EQ  (p2.league_size() , league_size);
     ASSERT_EQ  (p2.team_size()   , team_size);
     ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(), 0);
+    ASSERT_EQ  (p2.scratch_size(0), 0);
     ASSERT_EQ  (p5.league_size() , league_size);
     ASSERT_EQ  (p5.team_size()   , team_size);
     ASSERT_EQ  (p5.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p5.scratch_size(), scratch_size);
+    ASSERT_EQ  (p5.scratch_size(0), scratch_size);
 
     policy_t p6 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
     ASSERT_EQ  (p2.league_size() , league_size);
     ASSERT_EQ  (p2.team_size()   , team_size);
     ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(), 0);
+    ASSERT_EQ  (p2.scratch_size(0), 0);
     ASSERT_EQ  (p6.league_size() , league_size);
     ASSERT_EQ  (p6.team_size()   , team_size);
     ASSERT_EQ  (p6.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p6.scratch_size(), scratch_size);
+    ASSERT_EQ  (p6.scratch_size(0), scratch_size);
 
     policy_t p7 = p3.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
     ASSERT_EQ  (p3.league_size() , league_size);
     ASSERT_EQ  (p3.team_size()   , team_size);
     ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(), per_team_scratch);
+    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
     ASSERT_EQ  (p7.league_size() , league_size);
     ASSERT_EQ  (p7.team_size()   , team_size);
     ASSERT_EQ  (p7.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p7.scratch_size(), scratch_size);
+    ASSERT_EQ  (p7.scratch_size(0), scratch_size);
 }
   void test_run_time_parameters() {
     test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace> >();
diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp
index a8f2acea1d..431b844c9f 100644
--- a/lib/kokkos/core/unit_test/TestQthread.cpp
+++ b/lib/kokkos/core/unit_test/TestQthread.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -249,6 +249,10 @@ TEST_F( qthread, team_shared ) {
   TestSharedTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >();
 }
 
+TEST_F( qthread, shmem_size) {
+  TestShmemSize< Kokkos::Qthread >();
+}
+
 TEST_F( qthread , team_scan )
 {
   TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10 );
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
index c7fb7e9004..be8b4f90a3 100644
--- a/lib/kokkos/core/unit_test/TestRange.hpp
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -185,7 +185,7 @@ struct TestRange {
       },error);
       ASSERT_EQ(error,0);
 
-      if(ExecSpace::concurrency()>1) {
+      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) {
         size_t min = N;
         size_t max = 0;
         for(int t=0; t<ExecSpace::concurrency(); t++) {
@@ -196,6 +196,7 @@ struct TestRange {
         //if(ExecSpace::concurrency()>2)
         //  ASSERT_TRUE(2*min<max);
       }
+      
     }
 
     {
@@ -218,7 +219,7 @@ struct TestRange {
       },error);
       ASSERT_EQ(error,0);
 
-      if(ExecSpace::concurrency()>1) {
+      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<const size_t>(4*ExecSpace::concurrency())) ) {
         size_t min = N;
         size_t max = 0;
         for(int t=0; t<ExecSpace::concurrency(); t++) {
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
index f5ce0e4dd2..53fc393bcc 100644
--- a/lib/kokkos/core/unit_test/TestReduce.hpp
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -457,7 +457,1415 @@ public:
     }
   }
 };
+}
+
+// Computes y^T*A*x
+// (modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
+
+#if ( ! defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA )
+
+template< typename ScalarType , class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+
+  TestTripleNestedReduce( const size_type & nrows , const size_type & ncols 
+                        , const size_type & team_size , const size_type & vector_length )
+  {
+    run_test( nrows , ncols , team_size, vector_length );
+  }
+
+  void run_test( const size_type & nrows , const size_type & ncols 
+               , const size_type & team_size, const size_type & vector_length )
+  {
+    //typedef Kokkos::LayoutLeft Layout;
+    typedef Kokkos::LayoutRight Layout;
+
+    typedef Kokkos::View<ScalarType* , DeviceType>            ViewVector;
+    typedef Kokkos::View<ScalarType** , Layout , DeviceType>   ViewMatrix;
+    ViewVector y( "y" , nrows );
+    ViewVector x( "x" , ncols );
+    ViewMatrix A( "A" , nrows , ncols );
+
+    typedef Kokkos::RangePolicy<DeviceType> range_policy;
+
+    // Initialize y vector
+    Kokkos::parallel_for( range_policy( 0 , nrows ) , KOKKOS_LAMBDA( const int i ) { y( i ) = 1; } );
+
+    // Initialize x vector
+    Kokkos::parallel_for( range_policy( 0 , ncols ) , KOKKOS_LAMBDA( const int i ) { x( i ) = 1; } );
+
+    typedef Kokkos::TeamPolicy<DeviceType>                        team_policy;
+    typedef typename Kokkos::TeamPolicy<DeviceType>::member_type  member_type;
+
+    // Initialize A matrix, note 2D indexing computation
+    Kokkos::parallel_for( team_policy( nrows , Kokkos::AUTO ) , KOKKOS_LAMBDA( const member_type& teamMember ) {
+      const int j = teamMember.league_rank();
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , ncols ) , [&] ( const int i ) {
+        A( j , i ) = 1;
+      } );
+    } );
+
+    // Three level parallelism kernel to force caching of vector x 
+    ScalarType result = 0.0;
+    int chunk_size = 128;
+    Kokkos::parallel_reduce( team_policy( nrows/chunk_size , team_size , vector_length ) , KOKKOS_LAMBDA ( const member_type& teamMember , double &update ) {
+      const int row_start = teamMember.league_rank() * chunk_size;
+      const int row_end   = row_start + chunk_size;
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , row_start , row_end ) , [&] ( const int i ) {
+        ScalarType sum_i = 0.0;
+        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember , ncols ) , [&] ( const int j , ScalarType &innerUpdate ) {
+          innerUpdate += A( i , j ) * x( j );
+        } , sum_i );
+        Kokkos::single( Kokkos::PerThread( teamMember ) , [&] () {
+          update += y( i ) * sum_i;
+        } );
+      } );
+    } , result );
+
+    const ScalarType solution= ( ScalarType ) nrows * ( ScalarType ) ncols;
+    ASSERT_EQ( solution , result );
+  }
+};
+
+#else /* #if ( ! defined( KOKKOS_HAVE_CUDA ) ) || defined( KOKKOS_CUDA_USE_LAMBDA ) */
+
+template< typename ScalarType , class DeviceType >
+class TestTripleNestedReduce
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  TestTripleNestedReduce( const size_type & , const size_type  
+                        , const size_type & , const size_type )
+  { }
+};
+
+#endif
+
+//--------------------------------------------------------------------------
+
+namespace Test {
+namespace ReduceCombinatorical {
+
+template<class Scalar,class Space = Kokkos::HostSpace>
+struct AddPlus {
+public:
+  //Required
+  typedef AddPlus reducer_type;
+  typedef Scalar value_type;
+
+  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+
+private:
+  result_view_type result;
+
+public:
+
+  AddPlus(value_type& result_):result(&result_) {}
+
+  //Required
+  KOKKOS_INLINE_FUNCTION
+  void join(value_type& dest, const value_type& src)  const {
+    dest += src + 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& dest, const volatile value_type& src) const {
+    dest += src + 1;
+  }
+
+  //Optional
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type& val)  const {
+    val = value_type();
+  }
+
+  result_view_type result_view() const {
+    return result;
+  }
+};
+
+template<int ISTEAM>
+struct FunctorScalar;
+
+template<>
+struct FunctorScalar<0>{
+  FunctorScalar(Kokkos::View<double> r):result(r) {}
+  Kokkos::View<double> result;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i,double& update) const {
+    update+=i;
+  }
+};
+
+template<>
+struct FunctorScalar<1>{
+  FunctorScalar(Kokkos::View<double> r):result(r) {}
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+};
+
+template<int ISTEAM>
+struct FunctorScalarInit;
+
+template<>
+struct FunctorScalarInit<0> {
+  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+template<>
+struct FunctorScalarInit<1> {
+  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+template<int ISTEAM>
+struct FunctorScalarFinal;
+
+
+template<>
+struct FunctorScalarFinal<0> {
+  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarFinal<1> {
+  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team, double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+template<int ISTEAM>
+struct FunctorScalarJoin;
+
+template<>
+struct FunctorScalarJoin<0> {
+  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+};
+
+template<>
+struct FunctorScalarJoin<1> {
+  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+};
+
+template<int ISTEAM>
+struct FunctorScalarJoinFinal;
+
+template<>
+struct FunctorScalarJoinFinal<0> {
+  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+template<>
+struct FunctorScalarJoinFinal<1> {
+  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+};
+
+// Reduction functor family providing join() and init() (but no final());
+// init() sets the thread-local identity explicitly. ISTEAM selects a
+// range-policy (0) or team-policy (1) operator().
+template<int ISTEAM>
+struct FunctorScalarJoinInit;
+
+// Range-policy variant: sums the iteration index i.
+template<>
+struct FunctorScalarJoinInit<0> {
+  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+
+  // Not written by this variant; kept for interface parity with the
+  // Final members of this functor family.
+  Kokkos::View<double> result;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  // Custom reduction combine: plain sum.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  // Explicit identity for the sum reduction.
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+// Team-policy variant: each team contributes league_rank() in total.
+template<>
+struct FunctorScalarJoinInit<1> {
+  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+// Reduction functor family providing the full custom-reduction interface:
+// join() + final() + init(). This is the most complete variant exercised by
+// the combinatorial parallel_reduce tests. ISTEAM selects a range-policy
+// (0) or team-policy (1) operator().
+template<int ISTEAM>
+struct FunctorScalarJoinFinalInit;
+
+// Range-policy variant: sums the iteration index i.
+template<>
+struct FunctorScalarJoinFinalInit<0> {
+  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, double& update)  const {
+    update += i;
+  }
+
+  // Custom reduction combine: plain sum.
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  // Publishes the fully reduced value into the result view.
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+
+  // Explicit identity for the sum reduction.
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+
+// Team-policy variant: each team contributes league_rank() in total
+// (team_size() threads each add league_rank()/team_size()).
+template<>
+struct FunctorScalarJoinFinalInit<1> {
+  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
+
+  Kokkos::View<double> result;
+
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_type& team,double& update) const {
+    update+=1.0/team.team_size()*team.league_rank();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile double& dst, const volatile double& update) const {
+    dst += update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void final(double& update) const {
+    result() = update;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& update) const {
+    update = 0.0;
+  }
+};
+// Minimal scalar reduction functor: sums the iteration index. No custom
+// join/init/final, so Kokkos uses the default sum semantics for double.
+struct Functor1 {
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i,double& update) const {
+    update+=i;
+  }
+};
+
+// Array-valued reduction functor with a runtime-sized value_type.
+// `value_count` tells Kokkos how many doubles each reduction value holds;
+// every slot accumulates the same sum of the iteration index.
+struct Functor2 {
+  typedef double value_type[];
+  const unsigned value_count;
+
+  Functor2(unsigned n):value_count(n){}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const unsigned& i,double update[]) const {
+    for(unsigned j=0;j<value_count;j++)
+      update[j]+=i;
+  }
+
+  // Identity: zero every slot of the reduction array.
+  KOKKOS_INLINE_FUNCTION
+  void init( double dst[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+  }
+
+  // Element-wise sum combine for the array value_type.
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile double dst[] ,
+             const volatile double src[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+  }
+};
+
+}
+}
+
+namespace Test {
+
+// Combinatorially instantiates Kokkos::parallel_reduce over the cross
+// product of: {no label, const char*, c_str, std::string} x {int count,
+// RangePolicy/TeamPolicy variants} x {functor/lambda flavors} x {return
+// argument flavors}. Arguments accumulate through the variadic packs as
+// the Add* layers chain: AddLabel -> AddPolicy -> AddFunctor*/AddLambda*
+// -> AddReturnArgument -> CallParallelReduce. The expected reduction
+// value throughout is sum(0..999) = 1000*999/2.
+template<class ExecSpace = Kokkos::DefaultExecutionSpace>
+struct TestReduceCombinatoricalInstantiation {
+  template<class ... Args>
+  static void CallParallelReduce(Args... args) {
+    Kokkos::parallel_reduce(args...);
+  }
+
+  // Innermost layer: appends each supported result-argument flavor
+  // (scalar by reference, View, unmanaged View wrapping a scalar, const
+  // unmanaged View, and a custom "AddPlus" reducer) and checks the result.
+  template<class ... Args>
+  static void AddReturnArgument(Args... args) {
+    Kokkos::View<double,Kokkos::HostSpace> result_view("ResultView");
+    double expected_result = 1000.0*999.0/2.0;
+
+    double value = 0;
+    Kokkos::parallel_reduce(args...,value);
+    ASSERT_EQ(expected_result,value);
+
+    result_view() = 0;
+    CallParallelReduce(args...,result_view);
+    ASSERT_EQ(expected_result,result_view());
+
+    value = 0;
+    CallParallelReduce(args...,Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&value));
+    ASSERT_EQ(expected_result,value);
+
+    result_view() = 0;
+    const Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> result_view_const_um = result_view;
+    CallParallelReduce(args...,result_view_const_um);
+    ASSERT_EQ(expected_result,result_view_const_um());
+
+    // AddPlus presumably over-counts when multiple threads combine
+    // (hence strict '<' with concurrency > 1) — behavior defined by the
+    // AddPlus reducer declared earlier in this file.
+    value = 0;
+    CallParallelReduce(args...,Test::ReduceCombinatorical::AddPlus<double>(value));
+    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<value);
+    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<=value);
+    else
+      ASSERT_EQ(expected_result,value);
+
+    value = 0;
+    Test::ReduceCombinatorical::AddPlus<double> add(value);
+    CallParallelReduce(args...,add);
+    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<value);
+    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
+      ASSERT_TRUE(expected_result<=value);
+    else
+      ASSERT_EQ(expected_result,value);
+  }
+
+
+  // Lambda flavors. The void* overloads are selected (via std::conditional
+  // below) only when ExecSpace is the default execution space, since
+  // KOKKOS_LAMBDA can only target the default space; otherwise the
+  // Kokkos::InvalidType no-op overloads are chosen.
+  template<class ... Args>
+  static void AddLambdaRange(void*,Args... args) {
+    AddReturnArgument(args...,  KOKKOS_LAMBDA (const int&i , double& lsum) {
+      lsum += i;
+    });
+  }
+
+  template<class ... Args>
+  static void AddLambdaTeam(void*,Args... args) {
+    AddReturnArgument(args..., KOKKOS_LAMBDA (const Kokkos::TeamPolicy<>::member_type& team, double& update) {
+      update+=1.0/team.team_size()*team.league_rank();
+    });
+  }
+
+  template<class ... Args>
+  static void AddLambdaRange(Kokkos::InvalidType,Args... args) {
+  }
+
+  template<class ... Args>
+  static void AddLambdaTeam(Kokkos::InvalidType,Args... args) {
+  }
+
+  // Appends every functor variant (plain, Init, Join, JoinInit) through
+  // AddReturnArgument, then runs the Final-capable variants directly and
+  // reads the result back from the device view.
+  template<int ISTEAM, class ... Args>
+  static void AddFunctor(Args... args) {
+    Kokkos::View<double> result_view("FunctorView");
+    auto h_r = Kokkos::create_mirror_view(result_view);
+    Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view);
+    double expected_result = 1000.0*999.0/2.0;
+
+    AddReturnArgument(args..., functor);
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view));
+    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view));
+
+    // Final-variant functors deliver the result via final() into the
+    // view, so no return argument is appended here.
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+
+    h_r() = 0;
+    Kokkos::deep_copy(result_view,h_r);
+    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>(result_view));
+    Kokkos::deep_copy(h_r,result_view);
+    ASSERT_EQ(expected_result,h_r());
+  }
+
+  template<class ... Args>
+  static void AddFunctorLambdaRange(Args... args) {
+    AddFunctor<0,Args...>(args...);
+    #ifdef  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    AddLambdaRange(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
+    #endif
+  }
+
+  template<class ... Args>
+  static void AddFunctorLambdaTeam(Args... args) {
+    AddFunctor<1,Args...>(args...);
+    #ifdef  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    AddLambdaTeam(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
+    #endif
+  }
+
+  // Appends each supported policy spelling: bare int (literal and
+  // variable), RangePolicy by value with static/dynamic schedules and
+  // chunk sizes, and the TeamPolicy equivalents.
+  template<class ... Args>
+  static void AddPolicy(Args... args) {
+    int N = 1000;
+    Kokkos::RangePolicy<ExecSpace> policy(0,N);
+
+    AddFunctorLambdaRange(args...,1000);
+    AddFunctorLambdaRange(args...,N);
+    AddFunctorLambdaRange(args...,policy);
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace>(0,N));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(0,N).set_chunk_size(10));
+    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N).set_chunk_size(10));
+
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace>(N,Kokkos::AUTO));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(N,Kokkos::AUTO).set_chunk_size(10));
+    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO).set_chunk_size(10));
+  }
+
+
+  // Outermost layer: no label, string literal, c_str() and std::string.
+  static void AddLabel() {
+    std::string s("Std::String");
+    AddPolicy();
+    AddPolicy("Char Constant");
+    AddPolicy(s.c_str());
+    AddPolicy(s);
+  }
+
+  static void execute() {
+    AddLabel();
+  }
+};
+
+template<class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace>
+struct TestReducers {
+
+  // Per-element contribution functors for the built-in reducers. Each
+  // reads values(i) and folds it into the thread-local reduction value;
+  // combine/identity semantics come from the Kokkos::Experimental reducer
+  // passed to parallel_reduce, not from these structs.
+  struct SumFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value += values(i);
+    }
+  };
+
+  struct ProdFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value *= values(i);
+    }
+  };
+
+  struct MinFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      if(values(i) < value)
+        value = values(i);
+    }
+  };
+
+  struct MaxFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      if(values(i) > value)
+        value = values(i);
+    }
+  };
+
+  // Location-tracking variants: the reduction value_type is the reducer's
+  // struct (val/loc, or min_val/min_loc/max_val/max_loc for MinMaxLoc),
+  // and the functor updates both the extreme value and its index.
+  struct MinLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i,
+        typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type& value) const {
+      if(values(i) < value.val) {
+        value.val = values(i);
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MaxLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i,
+        typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type& value) const {
+      if(values(i) > value.val) {
+        value.val = values(i);
+        value.loc = i;
+      }
+    }
+  };
+
+  struct MinMaxLocFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i,
+        typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type& value) const {
+      if(values(i) > value.max_val) {
+        value.max_val = values(i);
+        value.max_loc = i;
+      }
+      if(values(i) < value.min_val) {
+        value.min_val = values(i);
+        value.min_loc = i;
+      }
+    }
+  };
+
+  // Bitwise (BAnd/BOr/BXor) and logical (LAnd/LOr/LXor) contribution
+  // functors; meaningful only for integral Scalar in the bitwise cases.
+  struct BAndFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value & values(i);
+    }
+  };
+
+  struct BOrFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value | values(i);
+    }
+  };
+
+  struct BXorFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value ^ values(i);
+    }
+  };
+
+  struct LAndFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value && values(i);
+    }
+  };
+
+  struct LOrFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value || values(i);
+    }
+  };
+
+  // Logical XOR spelled as a conditional: a XOR b == (a ? !b : b).
+  struct LXorFunctor {
+    Kokkos::View<const Scalar*,ExecSpace> values;
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i, Scalar& value) const {
+      value = value ? (!values(i)) : values(i);
+    }
+  };
+
+  // Exercises the Sum reducer through its four construction paths:
+  // (scalar), (scalar, explicit init), (host view), (host view, explicit
+  // init). Each run is checked against a host-computed reference sum and
+  // against the reducer's own result_view().
+  static void test_sum(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_sum = 0;
+    for(int i=0; i<N; i++) {
+      // rand()%100 keeps values small enough to avoid Scalar overflow
+      // for the N used by callers (assumption — confirm call sites).
+      h_values(i) = (Scalar)(rand()%100);
+      reference_sum += h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    SumFunctor f;
+    f.values = values;
+    Scalar init = 0;
+
+    {
+      Scalar sum_scalar = init;
+      Kokkos::Experimental::Sum<Scalar> reducer_scalar(sum_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(sum_scalar,reference_sum);
+      Scalar sum_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(sum_scalar_view,reference_sum);
+    }
+    {
+      Scalar sum_scalar_init = init;
+      Kokkos::Experimental::Sum<Scalar> reducer_scalar_init(sum_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(sum_scalar_init,reference_sum);
+      Scalar sum_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(sum_scalar_init_view,reference_sum);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view("View");
+      sum_view() = init;
+      Kokkos::Experimental::Sum<Scalar> reducer_view(sum_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar sum_view_scalar = sum_view();
+      ASSERT_EQ(sum_view_scalar,reference_sum);
+      Scalar sum_view_view = reducer_view.result_view()();
+      ASSERT_EQ(sum_view_view,reference_sum);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view_init("View");
+      sum_view_init() = init;
+      Kokkos::Experimental::Sum<Scalar> reducer_view_init(sum_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Scalar sum_view_init_scalar = sum_view_init();
+      ASSERT_EQ(sum_view_init_scalar,reference_sum);
+      Scalar sum_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(sum_view_init_view,reference_sum);
+    }
+  }
+
+  // Exercises the Prod reducer (same four construction paths as test_sum).
+  // Values are drawn from [1,4] to keep the product nonzero and bounded.
+  // NOTE(review): only the first and third sub-cases are guarded by
+  // is_arithmetic<Scalar> while the init variants run unconditionally —
+  // looks asymmetric; confirm whether that is intentional.
+  static void test_prod(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_prod = 1;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%4+1);
+      reference_prod *= h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    ProdFunctor f;
+    f.values = values;
+    Scalar init = 1;
+
+    if(std::is_arithmetic<Scalar>::value)
+    {
+      Scalar prod_scalar = init;
+      Kokkos::Experimental::Prod<Scalar> reducer_scalar(prod_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(prod_scalar,reference_prod);
+      Scalar prod_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(prod_scalar_view,reference_prod);
+    }
+    {
+      Scalar prod_scalar_init = init;
+      Kokkos::Experimental::Prod<Scalar> reducer_scalar_init(prod_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(prod_scalar_init,reference_prod);
+      Scalar prod_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(prod_scalar_init_view,reference_prod);
+    }
+
+    if(std::is_arithmetic<Scalar>::value)
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view("View");
+      prod_view() = init;
+      Kokkos::Experimental::Prod<Scalar> reducer_view(prod_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar prod_view_scalar = prod_view();
+      ASSERT_EQ(prod_view_scalar,reference_prod);
+      Scalar prod_view_view = reducer_view.result_view()();
+      ASSERT_EQ(prod_view_view,reference_prod);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view_init("View");
+      prod_view_init() = init;
+      Kokkos::Experimental::Prod<Scalar> reducer_view_init(prod_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Scalar prod_view_init_scalar = prod_view_init();
+      ASSERT_EQ(prod_view_init_scalar,reference_prod);
+      Scalar prod_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(prod_view_init_view,reference_prod);
+    }
+  }
+
+  // Exercises the Min reducer through its four construction paths,
+  // seeding with numeric_limits<Scalar>::max() as the identity.
+  static void test_min(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%100000);
+      if(h_values(i)<reference_min)
+        reference_min = h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    MinFunctor f;
+    f.values = values;
+    Scalar init = std::numeric_limits<Scalar>::max();
+
+    {
+      Scalar min_scalar = init;
+      Kokkos::Experimental::Min<Scalar> reducer_scalar(min_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(min_scalar,reference_min);
+      Scalar min_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(min_scalar_view,reference_min);
+    }
+    {
+      Scalar min_scalar_init = init;
+      Kokkos::Experimental::Min<Scalar> reducer_scalar_init(min_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(min_scalar_init,reference_min);
+      Scalar min_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(min_scalar_init_view,reference_min);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> min_view("View");
+      min_view() = init;
+      Kokkos::Experimental::Min<Scalar> reducer_view(min_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar min_view_scalar = min_view();
+      ASSERT_EQ(min_view_scalar,reference_min);
+      Scalar min_view_view = reducer_view.result_view()();
+      ASSERT_EQ(min_view_view,reference_min);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> min_view_init("View");
+      min_view_init() = init;
+      Kokkos::Experimental::Min<Scalar> reducer_view_init(min_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Scalar min_view_init_scalar = min_view_init();
+      ASSERT_EQ(min_view_init_scalar,reference_min);
+      Scalar min_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(min_view_init_view,reference_min);
+    }
+  }
+
+  // Exercises the Max reducer through its four construction paths.
+  // NOTE(review): numeric_limits<Scalar>::min() is the smallest *positive*
+  // value for floating-point Scalar (lowest() would be the true identity);
+  // presumably Scalar is instantiated with integral types here — confirm.
+  static void test_max(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%100000+1);
+      if(h_values(i)>reference_max)
+        reference_max = h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    MaxFunctor f;
+    f.values = values;
+    Scalar init = std::numeric_limits<Scalar>::min();
+
+    {
+      Scalar max_scalar = init;
+      Kokkos::Experimental::Max<Scalar> reducer_scalar(max_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(max_scalar,reference_max);
+      Scalar max_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(max_scalar_view,reference_max);
+    }
+    {
+      Scalar max_scalar_init = init;
+      Kokkos::Experimental::Max<Scalar> reducer_scalar_init(max_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(max_scalar_init,reference_max);
+      Scalar max_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(max_scalar_init_view,reference_max);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> max_view("View");
+      max_view() = init;
+      Kokkos::Experimental::Max<Scalar> reducer_view(max_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar max_view_scalar = max_view();
+      ASSERT_EQ(max_view_scalar,reference_max);
+      Scalar max_view_view = reducer_view.result_view()();
+      ASSERT_EQ(max_view_view,reference_max);
+    }
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> max_view_init("View");
+      max_view_init() = init;
+      Kokkos::Experimental::Max<Scalar> reducer_view_init(max_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Scalar max_view_init_scalar = max_view_init();
+      ASSERT_EQ(max_view_init_scalar,reference_max);
+      Scalar max_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(max_view_init_view,reference_max);
+    }
+  }
+
+  // Exercises the MinLoc reducer: checks both the minimum value and the
+  // index it was found at, against a host-computed reference. The scalar
+  // sub-cases rely on the reducer's own init (value left uninitialized
+  // here — presumably the reducer initializes it; confirm).
+  static void test_minloc(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_min = std::numeric_limits<Scalar>::max();
+    int reference_loc = -1;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%100000);
+      if(h_values(i)<reference_min) {
+        reference_min = h_values(i);
+        reference_loc = i;
+      }
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    MinLocFunctor f;
+    typedef typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type value_type;
+    f.values = values;
+    Scalar init = std::numeric_limits<Scalar>::max();
+
+
+    {
+      value_type min_scalar;
+      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar(min_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(min_scalar.val,reference_min);
+      ASSERT_EQ(min_scalar.loc,reference_loc);
+      value_type min_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(min_scalar_view.val,reference_min);
+      ASSERT_EQ(min_scalar_view.loc,reference_loc);
+    }
+    {
+      value_type min_scalar_init;
+      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar_init(min_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(min_scalar_init.val,reference_min);
+      ASSERT_EQ(min_scalar_init.loc,reference_loc);
+      value_type min_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(min_scalar_init_view.val,reference_min);
+      ASSERT_EQ(min_scalar_init_view.loc,reference_loc);
+    }
+    {
+      Kokkos::View<value_type,Kokkos::HostSpace> min_view("View");
+      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view(min_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      value_type min_view_scalar = min_view();
+      ASSERT_EQ(min_view_scalar.val,reference_min);
+      ASSERT_EQ(min_view_scalar.loc,reference_loc);
+      value_type min_view_view = reducer_view.result_view()();
+      ASSERT_EQ(min_view_view.val,reference_min);
+      ASSERT_EQ(min_view_view.loc,reference_loc);
+    }
+    {
+      Kokkos::View<value_type,Kokkos::HostSpace> min_view_init("View");
+      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view_init(min_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      value_type min_view_init_scalar = min_view_init();
+      ASSERT_EQ(min_view_init_scalar.val,reference_min);
+      ASSERT_EQ(min_view_init_scalar.loc,reference_loc);
+      value_type min_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(min_view_init_view.val,reference_min);
+      ASSERT_EQ(min_view_init_view.loc,reference_loc);
+    }
+  }
+
+  // Exercises the MaxLoc reducer: checks both the maximum value and its
+  // index. NOTE(review): same numeric_limits<Scalar>::min() caveat as
+  // test_max — not the lowest value for floating-point Scalar; confirm
+  // the instantiated Scalar types.
+  static void test_maxloc(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_max = std::numeric_limits<Scalar>::min();
+    int reference_loc = -1;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%100000);
+      if(h_values(i)>reference_max) {
+        reference_max = h_values(i);
+        reference_loc = i;
+      }
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    MaxLocFunctor f;
+    typedef typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type value_type;
+    f.values = values;
+    Scalar init = std::numeric_limits<Scalar>::min();
+
+
+    {
+      value_type max_scalar;
+      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar(max_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(max_scalar.val,reference_max);
+      ASSERT_EQ(max_scalar.loc,reference_loc);
+      value_type max_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(max_scalar_view.val,reference_max);
+      ASSERT_EQ(max_scalar_view.loc,reference_loc);
+    }
+    {
+      value_type max_scalar_init;
+      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar_init(max_scalar_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+      ASSERT_EQ(max_scalar_init.val,reference_max);
+      ASSERT_EQ(max_scalar_init.loc,reference_loc);
+      value_type max_scalar_init_view = reducer_scalar_init.result_view()();
+      ASSERT_EQ(max_scalar_init_view.val,reference_max);
+      ASSERT_EQ(max_scalar_init_view.loc,reference_loc);
+    }
+    {
+      Kokkos::View<value_type,Kokkos::HostSpace> max_view("View");
+      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view(max_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      value_type max_view_scalar = max_view();
+      ASSERT_EQ(max_view_scalar.val,reference_max);
+      ASSERT_EQ(max_view_scalar.loc,reference_loc);
+      value_type max_view_view = reducer_view.result_view()();
+      ASSERT_EQ(max_view_view.val,reference_max);
+      ASSERT_EQ(max_view_view.loc,reference_loc);
+    }
+    {
+      Kokkos::View<value_type,Kokkos::HostSpace> max_view_init("View");
+      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view_init(max_view_init,init);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      value_type max_view_init_scalar = max_view_init();
+      ASSERT_EQ(max_view_init_scalar.val,reference_max);
+      ASSERT_EQ(max_view_init_scalar.loc,reference_loc);
+      value_type max_view_init_view = reducer_view_init.result_view()();
+      ASSERT_EQ(max_view_init_view.val,reference_max);
+      ASSERT_EQ(max_view_init_view.loc,reference_loc);
+    }
+  }
+
+  // Exercises the combined MinMaxLoc reducer: a single pass tracks min
+  // and max values plus both locations; all four fields are checked per
+  // construction path. Same numeric_limits::min() caveat as test_max for
+  // the max identity with floating-point Scalar.
+  static void test_minmaxloc(int N) {
+     Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+     auto h_values = Kokkos::create_mirror_view(values);
+     Scalar reference_max = std::numeric_limits<Scalar>::min();
+     Scalar reference_min = std::numeric_limits<Scalar>::max();
+     int reference_minloc = -1;
+     int reference_maxloc = -1;
+     for(int i=0; i<N; i++) {
+       h_values(i) = (Scalar)(rand()%100000);
+       if(h_values(i)>reference_max) {
+         reference_max = h_values(i);
+         reference_maxloc = i;
+       }
+       if(h_values(i)<reference_min) {
+         reference_min = h_values(i);
+         reference_minloc = i;
+       }
+     }
+     Kokkos::deep_copy(values,h_values);
+
+     MinMaxLocFunctor f;
+     typedef typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type value_type;
+     f.values = values;
+     Scalar init_min = std::numeric_limits<Scalar>::max();
+     Scalar init_max = std::numeric_limits<Scalar>::min();
+
+
+     {
+       value_type minmax_scalar;
+       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar(minmax_scalar);
+       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+       ASSERT_EQ(minmax_scalar.min_val,reference_min);
+       ASSERT_EQ(minmax_scalar.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_scalar.max_val,reference_max);
+       ASSERT_EQ(minmax_scalar.max_loc,reference_maxloc);
+       value_type minmax_scalar_view = reducer_scalar.result_view()();
+       ASSERT_EQ(minmax_scalar_view.min_val,reference_min);
+       ASSERT_EQ(minmax_scalar_view.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_scalar_view.max_val,reference_max);
+       ASSERT_EQ(minmax_scalar_view.max_loc,reference_maxloc);
+     }
+     {
+       value_type minmax_scalar_init;
+       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar_init(minmax_scalar_init,init_min,init_max);
+       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
+       ASSERT_EQ(minmax_scalar_init.min_val,reference_min);
+       ASSERT_EQ(minmax_scalar_init.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_scalar_init.max_val,reference_max);
+       ASSERT_EQ(minmax_scalar_init.max_loc,reference_maxloc);
+       value_type minmax_scalar_init_view = reducer_scalar_init.result_view()();
+       ASSERT_EQ(minmax_scalar_init_view.min_val,reference_min);
+       ASSERT_EQ(minmax_scalar_init_view.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_scalar_init_view.max_val,reference_max);
+       ASSERT_EQ(minmax_scalar_init_view.max_loc,reference_maxloc);
+     }
+     {
+       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view("View");
+       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view(minmax_view);
+       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+       value_type minmax_view_scalar = minmax_view();
+       ASSERT_EQ(minmax_view_scalar.min_val,reference_min);
+       ASSERT_EQ(minmax_view_scalar.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_view_scalar.max_val,reference_max);
+       ASSERT_EQ(minmax_view_scalar.max_loc,reference_maxloc);
+       value_type minmax_view_view = reducer_view.result_view()();
+       ASSERT_EQ(minmax_view_view.min_val,reference_min);
+       ASSERT_EQ(minmax_view_view.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_view_view.max_val,reference_max);
+       ASSERT_EQ(minmax_view_view.max_loc,reference_maxloc);
+     }
+     {
+       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view_init("View");
+       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view_init(minmax_view_init,init_min,init_max);
+       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+       value_type minmax_view_init_scalar = minmax_view_init();
+       ASSERT_EQ(minmax_view_init_scalar.min_val,reference_min);
+       ASSERT_EQ(minmax_view_init_scalar.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_view_init_scalar.max_val,reference_max);
+       ASSERT_EQ(minmax_view_init_scalar.max_loc,reference_maxloc);
+       value_type minmax_view_init_view = reducer_view_init.result_view()();
+       ASSERT_EQ(minmax_view_init_view.min_val,reference_min);
+       ASSERT_EQ(minmax_view_init_view.min_loc,reference_minloc);
+       ASSERT_EQ(minmax_view_init_view.max_val,reference_max);
+       ASSERT_EQ(minmax_view_init_view.max_loc,reference_maxloc);
+     }
+   }
+
+  // Exercises the bitwise-AND reducer. The identity `Scalar() | ~Scalar()`
+  // is an all-ones bit pattern, so AND-ing the inputs into it yields the
+  // plain AND of all values. Only the scalar and view construction paths
+  // are tested here (no explicit-init variants).
+  static void test_BAnd(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_band = Scalar() | (~Scalar());
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%100000+1);
+      reference_band = reference_band & h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    BAndFunctor f;
+    f.values = values;
+    Scalar init = Scalar() | (~Scalar());
+
+    {
+      Scalar band_scalar = init;
+      Kokkos::Experimental::BAnd<Scalar> reducer_scalar(band_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(band_scalar,reference_band);
+      Scalar band_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(band_scalar_view,reference_band);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> band_view("View");
+      band_view() = init;
+      Kokkos::Experimental::BAnd<Scalar> reducer_view(band_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar band_view_scalar = band_view();
+      ASSERT_EQ(band_view_scalar,reference_band);
+      Scalar band_view_view = reducer_view.result_view()();
+      ASSERT_EQ(band_view_view,reference_band);
+    }
+  }
+
+  static void test_BOr(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_bor = Scalar() & (~Scalar());
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)((rand()%100000+1)*2);
+      reference_bor = reference_bor | h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    BOrFunctor f;
+    f.values = values;
+    Scalar init = Scalar() & (~Scalar());
+
+    {
+      Scalar bor_scalar = init;
+      Kokkos::Experimental::BOr<Scalar> reducer_scalar(bor_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(bor_scalar,reference_bor);
+      Scalar bor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(bor_scalar_view,reference_bor);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> bor_view("View");
+      bor_view() = init;
+      Kokkos::Experimental::BOr<Scalar> reducer_view(bor_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar bor_view_scalar = bor_view();
+      ASSERT_EQ(bor_view_scalar,reference_bor);
+      Scalar bor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(bor_view_view,reference_bor);
+    }
+  }
+
+  static void test_BXor(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_bxor = Scalar() & (~Scalar());
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)((rand()%100000+1)*2);
+      reference_bxor = reference_bxor ^ h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    BXorFunctor f;
+    f.values = values;
+    Scalar init = Scalar() & (~Scalar());
+
+    {
+      Scalar bxor_scalar = init;
+      Kokkos::Experimental::BXor<Scalar> reducer_scalar(bxor_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(bxor_scalar,reference_bxor);
+      Scalar bxor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(bxor_scalar_view,reference_bxor);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> bxor_view("View");
+      bxor_view() = init;
+      Kokkos::Experimental::BXor<Scalar> reducer_view(bxor_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar bxor_view_scalar = bxor_view();
+      ASSERT_EQ(bxor_view_scalar,reference_bxor);
+      Scalar bxor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(bxor_view_view,reference_bxor);
+    }
+  }
 
+  static void test_LAnd(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_land = 1;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%2);
+      reference_land = reference_land && h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    LAndFunctor f;
+    f.values = values;
+    Scalar init = 1;
+
+    {
+      Scalar land_scalar = init;
+      Kokkos::Experimental::LAnd<Scalar> reducer_scalar(land_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(land_scalar,reference_land);
+      Scalar land_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(land_scalar_view,reference_land);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> land_view("View");
+      land_view() = init;
+      Kokkos::Experimental::LAnd<Scalar> reducer_view(land_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar land_view_scalar = land_view();
+      ASSERT_EQ(land_view_scalar,reference_land);
+      Scalar land_view_view = reducer_view.result_view()();
+      ASSERT_EQ(land_view_view,reference_land);
+    }
+  }
+
+  static void test_LOr(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_lor = 0;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%2);
+      reference_lor = reference_lor || h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    LOrFunctor f;
+    f.values = values;
+    Scalar init = 0;
+
+    {
+      Scalar lor_scalar = init;
+      Kokkos::Experimental::LOr<Scalar> reducer_scalar(lor_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(lor_scalar,reference_lor);
+      Scalar lor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(lor_scalar_view,reference_lor);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> lor_view("View");
+      lor_view() = init;
+      Kokkos::Experimental::LOr<Scalar> reducer_view(lor_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar lor_view_scalar = lor_view();
+      ASSERT_EQ(lor_view_scalar,reference_lor);
+      Scalar lor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(lor_view_view,reference_lor);
+    }
+  }
+
+  static void test_LXor(int N) {
+    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
+    auto h_values = Kokkos::create_mirror_view(values);
+    Scalar reference_lxor = 0;
+    for(int i=0; i<N; i++) {
+      h_values(i) = (Scalar)(rand()%2);
+      reference_lxor = reference_lxor ? (!h_values(i)) : h_values(i);
+    }
+    Kokkos::deep_copy(values,h_values);
+
+    LXorFunctor f;
+    f.values = values;
+    Scalar init = 0;
+
+    {
+      Scalar lxor_scalar = init;
+      Kokkos::Experimental::LXor<Scalar> reducer_scalar(lxor_scalar);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
+      ASSERT_EQ(lxor_scalar,reference_lxor);
+      Scalar lxor_scalar_view = reducer_scalar.result_view()();
+      ASSERT_EQ(lxor_scalar_view,reference_lxor);
+    }
+
+    {
+      Kokkos::View<Scalar,Kokkos::HostSpace> lxor_view("View");
+      lxor_view() = init;
+      Kokkos::Experimental::LXor<Scalar> reducer_view(lxor_view);
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Scalar lxor_view_scalar = lxor_view();
+      ASSERT_EQ(lxor_view_scalar,reference_lxor);
+      Scalar lxor_view_view = reducer_view.result_view()();
+      ASSERT_EQ(lxor_view_view,reference_lxor);
+    }
+  }
+
+  static void execute_float() {
+    test_sum(10001);
+    test_prod(35);
+    test_min(10003);
+    test_minloc(10003);
+    test_max(10007);
+    test_maxloc(10007);
+    test_minmaxloc(10007);
+  }
+
+  static void execute_integer() {
+    test_sum(10001);
+    test_prod(35);
+    test_min(10003);
+    test_minloc(10003);
+    test_max(10007);
+    test_maxloc(10007);
+    test_minmaxloc(10007);
+    test_BAnd(35);
+    test_BOr(35);
+    test_BXor(35);
+    test_LAnd(35);
+    test_LOr(35);
+    test_LXor(35);
+  }
+
+  static void execute_basic() {
+    test_sum(10001);
+    test_prod(35);
+  }
+};
 }
 
 /*--------------------------------------------------------------------------*/
diff --git a/lib/kokkos/core/unit_test/TestSerial.cpp b/lib/kokkos/core/unit_test/TestSerial.cpp
index 7ddb54241c..d85614e66e 100644
--- a/lib/kokkos/core/unit_test/TestSerial.cpp
+++ b/lib/kokkos/core/unit_test/TestSerial.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -66,6 +66,7 @@
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
 #include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
 #include <TestRange.hpp>
 #include <TestTeam.hpp>
 #include <TestReduce.hpp>
@@ -85,6 +86,8 @@
 
 #include <TestPolicyConstruction.hpp>
 
+#include <TestMDRange.hpp>
+
 namespace Test {
 
 class serial : public ::testing::Test {
@@ -99,6 +102,12 @@ protected:
     }
 };
 
+TEST_F( serial , md_range ) {
+  TestMDRange_2D< Kokkos::Serial >::test_for2(100,100);
+
+  TestMDRange_3D< Kokkos::Serial >::test_for3(100,100,100);
+}
+
 TEST_F( serial , impl_shared_alloc ) {
   test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >();
 }
@@ -199,6 +208,14 @@ TEST_F( serial, double_reduce) {
   TestReduce< double ,   Kokkos::Serial >( 1000000 );
 }
 
+TEST_F( serial , reducers )
+{
+  TestReducers<int, Kokkos::Serial>::execute_integer();
+  TestReducers<size_t, Kokkos::Serial>::execute_integer();
+  TestReducers<double, Kokkos::Serial>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::Serial>::execute_basic();
+}
+
 TEST_F( serial, long_reduce_dynamic ) {
   TestReduceDynamic< long ,   Kokkos::Serial >( 1000000 );
 }
@@ -237,13 +254,17 @@ TEST_F( serial , team_shared_request) {
   TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) 
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
 TEST_F( serial , team_lambda_shared_request) {
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
+TEST_F( serial, shmem_size) {
+  TestShmemSize< Kokkos::Serial >();
+}
+
 TEST_F( serial  , team_scan )
 {
   TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10 );
@@ -345,6 +366,74 @@ TEST_F( serial , atomics )
   ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,3) ) );
 }
 
+TEST_F( serial , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 4 ) ) );
+  }
+
+}
 //----------------------------------------------------------------------------
 
 TEST_F( serial, tile_layout )
@@ -391,12 +480,36 @@ TEST_F( serial , memory_pool )
   bool val = TestMemoryPool::test_mempool< Kokkos::Serial >( 128, 128000000 );
   ASSERT_TRUE( val );
 
-  TestMemoryPool::test_mempool2< Kokkos::Serial >( 128, 128000000 );
+  TestMemoryPool::test_mempool2< Kokkos::Serial >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Serial >();
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( serial , task_policy )
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+TEST_F( serial , task_fib )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestFib< Kokkos::Serial >::run(i);
+  }
+}
+
+TEST_F( serial , task_depend )
+{
+  for ( int i = 0 ; i < 25 ; ++i ) {
+    TestTaskPolicy::TestTaskDependence< Kokkos::Serial >::run(i);
+  }
+}
+
+TEST_F( serial , task_team )
+{
+  TestTaskPolicy::TestTaskTeam< Kokkos::Serial >::run(1000);
+  //TestTaskPolicy::TestTaskTeamValue< Kokkos::Serial >::run(1000); //put back after testing
+}
+
+TEST_F( serial , old_task_policy )
 {
   TestTaskPolicy::test_task_dep< Kokkos::Serial >( 10 );
   // TestTaskPolicy::test_norm2< Kokkos::Serial >( 1000 );
@@ -406,11 +519,13 @@ TEST_F( serial , task_policy )
   for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i);
 }
 
-TEST_F( serial , task_team )
+TEST_F( serial , old_task_team )
 {
   TestTaskPolicy::test_task_team< Kokkos::Serial >(1000);
 }
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
 //----------------------------------------------------------------------------
 
 TEST_F( serial , template_meta_functions )
diff --git a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
index e5c461af01..71790f6def 100644
--- a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
+++ b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
@@ -50,10 +50,489 @@
 #include <cmath>
 #include <Kokkos_TaskPolicy.hpp>
 
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
+namespace {
+
+long eval_fib( long n )
+{
+  constexpr long mask = 0x03 ;
+
+  long fib[4] = { 0 , 1 , 1 , 2 };
+
+  for ( long i = 2 ; i <= n ; ++i ) {
+    fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
+  }
+  
+  return fib[ n & mask ];
+}
+
+}
+
+template< typename Space >
+struct TestFib
+{
+  typedef Kokkos::TaskPolicy<Space>  policy_type ;
+  typedef Kokkos::Future<long,Space> future_type ;
+  typedef long value_type ;
+
+  policy_type policy ;
+  future_type fib_m1 ;
+  future_type fib_m2 ;
+  const value_type n ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestFib( const policy_type & arg_policy , const value_type arg_n )
+    : policy(arg_policy)
+    , fib_m1() , fib_m2()
+    , n( arg_n )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & , value_type & result )
+    {
+#if 0
+      printf( "\nTestFib(%ld) %d %d\n"
+             , n
+             , int( ! fib_m1.is_null() )
+             , int( ! fib_m2.is_null() )
+             );
+#endif
+
+      if ( n < 2 ) {
+        result = n ;
+      }
+      else if ( ! fib_m2.is_null() && ! fib_m1.is_null() ) {
+        result = fib_m1.get() + fib_m2.get();
+      }
+      else {
+
+        // Spawn new children and respawn myself to sum their results:
+        // Spawn lower value at higher priority as it has a shorter
+        // path to completion.
+
+        fib_m2 = policy.task_spawn( TestFib(policy,n-2)
+                                  , Kokkos::TaskSingle
+                                  , Kokkos::TaskHighPriority );
+
+        fib_m1 = policy.task_spawn( TestFib(policy,n-1)
+                                  , Kokkos::TaskSingle );
+
+        Kokkos::Future<Space> dep[] = { fib_m1 , fib_m2 };
+
+        Kokkos::Future<Space> fib_all = policy.when_all( 2 , dep );
+
+        if ( ! fib_m2.is_null() && ! fib_m1.is_null() && ! fib_all.is_null() ) {
+          // High priority to retire this branch
+          policy.respawn( this , Kokkos::TaskHighPriority , fib_all );
+        }
+        else {
+#if 0
+      printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+             , n
+             , policy.allocation_capacity()
+             , policy.allocated_task_count_max()
+             , policy.allocated_task_count_accum()
+             );
+#endif
+          Kokkos::abort("TestFib insufficient memory");
+
+        }
+      }
+    }
+
+  static void run( int i , size_t MemoryCapacity = 16000 )
+    {
+      typedef typename policy_type::memory_space memory_space ;
+
+      enum { Log2_SuperBlockSize = 12 };
+
+      policy_type root_policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+
+      future_type f = root_policy.host_spawn( TestFib(root_policy,i) , Kokkos::TaskSingle );
+      Kokkos::wait( root_policy );
+      ASSERT_EQ( eval_fib(i) , f.get() );
+
+#if 0
+      fprintf( stdout , "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+             , i
+             , int(root_policy.template spawn_allocation_size<TestFib>())
+             , int(root_policy.when_all_allocation_size(2))
+             , root_policy.allocation_capacity()
+             , root_policy.allocated_task_count_max()
+             , root_policy.allocated_task_count_accum()
+             );
+      fflush( stdout );
+#endif
+    }
+
+};
+
+} // namespace TestTaskPolicy
+
+//----------------------------------------------------------------------------
+
 namespace TestTaskPolicy {
 
+template< class Space >
+struct TestTaskDependence {
+
+  typedef Kokkos::TaskPolicy<Space>  policy_type ;
+  typedef Kokkos::Future<Space>      future_type ;
+  typedef Kokkos::View<long,Space>   accum_type ;
+  typedef void value_type ;
+
+  policy_type  m_policy ;
+  accum_type   m_accum ;
+  long         m_count ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskDependence( long n
+                    , const policy_type & arg_policy
+                    , const accum_type  & arg_accum )
+    : m_policy( arg_policy )
+    , m_accum( arg_accum )
+    , m_count( n )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & )
+    {
+       enum { CHUNK = 8 };
+       const int n = CHUNK < m_count ? CHUNK : m_count ;
+
+       if ( 1 < m_count ) {
+         future_type f[ CHUNK ] ;
+
+         const int inc = ( m_count + n - 1 ) / n ;
+
+         for ( int i = 0 ; i < n ; ++i ) {
+           long begin = i * inc ;
+           long count = begin + inc < m_count ? inc : m_count - begin ;
+           f[i] = m_policy.task_spawn( TestTaskDependence(count,m_policy,m_accum) , Kokkos::TaskSingle );
+         }
+
+         m_count = 0 ;
+
+         m_policy.respawn( this , m_policy.when_all( n , f ) );
+       }
+       else if ( 1 == m_count ) {
+         Kokkos::atomic_increment( & m_accum() );
+       }
+    }
+
+  static void run( int n )
+    {
+      typedef typename policy_type::memory_space memory_space ;
+
+      // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool
+      enum { MemoryCapacity = 16000 };
+      enum { Log2_SuperBlockSize = 12 };
+      policy_type policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+
+      accum_type accum("accum");
+
+      typename accum_type::HostMirror host_accum =
+        Kokkos::create_mirror_view( accum );
+
+      policy.host_spawn( TestTaskDependence(n,policy,accum) , Kokkos::TaskSingle );
+
+      Kokkos::wait( policy );
+
+      Kokkos::deep_copy( host_accum , accum );
+
+      ASSERT_EQ( host_accum() , n );
+    }
+};
+
+} // namespace TestTaskPolicy
+
 //----------------------------------------------------------------------------
 
+namespace TestTaskPolicy {
+
+template< class ExecSpace >
+struct TestTaskTeam {
+
+  //enum { SPAN = 8 };
+  enum { SPAN = 33 };
+  //enum { SPAN = 1 };
+
+  typedef void value_type ;
+  typedef Kokkos::TaskPolicy<ExecSpace>  policy_type ;
+  typedef Kokkos::Future<ExecSpace>      future_type ;
+  typedef Kokkos::View<long*,ExecSpace>  view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  parfor_result ;
+  view_type  parreduce_check ;
+  view_type  parscan_result ;
+  view_type  parscan_check ;
+  const long nvalue ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeam( const policy_type & arg_policy
+              , const view_type   & arg_parfor_result
+              , const view_type   & arg_parreduce_check
+              , const view_type   & arg_parscan_result
+              , const view_type   & arg_parscan_check
+              , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , parfor_result( arg_parfor_result )
+    , parreduce_check( arg_parreduce_check )
+    , parscan_result( arg_parscan_result )
+    , parscan_check( arg_parscan_check )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type & member )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.is_null() ) {
+        if ( member.team_rank() == 0 ) {
+          future = policy.task_spawn
+            ( TestTaskTeam( policy ,
+                            parfor_result ,
+                            parreduce_check,
+                            parscan_result,
+                            parscan_check,
+                            begin - 1 )
+            , Kokkos::TaskTeam );
+
+          assert( ! future.is_null() );
+
+          policy.respawn( this , future );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parfor_result[i] = i ; }
+                          );
+
+      // test parallel_reduce without join
+    
+      long tot = 0;
+      long expected = (begin+end-1)*(end-begin)*0.5;
+      
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &res) { res += parfor_result[i]; }
+                          , tot);
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parreduce_check[i] = expected-tot ; }
+                          );
+
+      // test parallel_reduce with join
+
+      tot = 0;
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &res) { res += parfor_result[i]; }
+                          , [&]( long& val1, const long& val2) { val1 += val2; }
+                          , tot);
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { parreduce_check[i] += expected-tot ; }
+                          );
+
+#if 0
+      // test parallel_scan
+
+      // Exclusive scan
+      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &val , const bool final ) {
+                              if ( final ) { parscan_result[i] = val; }
+                              val += i;
+                            }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        for ( long i = begin ; i < end ; ++i ) {
+          parscan_check[i] = (i*(i-1)-begin*(begin-1))*0.5-parscan_result[i];
+        }
+      }
+
+      // Inclusive scan
+      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i, long &val , const bool final ) {
+                              val += i;
+                              if ( final ) { parscan_result[i] = val; }
+                            }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        for ( long i = begin ; i < end ; ++i ) {
+          parscan_check[i] += (i*(i+1)-begin*(begin-1))*0.5-parscan_result[i];
+        }
+      }
+#endif
+
+    }
+
+  static void run( long n )
+    {
+      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
+      // const unsigned memory_capacity = 100000 ; // fails with SPAN=1 for serial and OMP
+      const unsigned memory_capacity = 400000 ;
+
+      policy_type root_policy( typename policy_type::memory_space()
+                        , memory_capacity );
+
+      view_type   root_parfor_result("parfor_result",n+1);
+      view_type   root_parreduce_check("parreduce_check",n+1);
+      view_type   root_parscan_result("parscan_result",n+1);
+      view_type   root_parscan_check("parscan_check",n+1);
+
+      typename view_type::HostMirror
+        host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
+      typename view_type::HostMirror
+        host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
+      typename view_type::HostMirror
+        host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
+      typename view_type::HostMirror
+        host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
+
+      future_type f = root_policy.host_spawn(
+                        TestTaskTeam( root_policy ,
+                                      root_parfor_result ,
+                                      root_parreduce_check ,
+                                      root_parscan_result,
+                                      root_parscan_check,
+                                      n ) ,
+                        Kokkos::TaskTeam );
+
+      Kokkos::wait( root_policy );
+
+      Kokkos::deep_copy( host_parfor_result , root_parfor_result );
+      Kokkos::deep_copy( host_parreduce_check , root_parreduce_check );
+      Kokkos::deep_copy( host_parscan_result , root_parscan_result );
+      Kokkos::deep_copy( host_parscan_check , root_parscan_check );
+
+      for ( long i = 0 ; i <= n ; ++i ) {
+        const long answer = i ;
+        if ( host_parfor_result(i) != answer ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
+                    << host_parfor_result(i) << " != " << answer << std::endl ;
+        }
+        if ( host_parreduce_check(i) != 0 ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
+                    << host_parreduce_check(i) << " != 0" << std::endl ;
+        } //TODO
+        if ( host_parscan_check(i) != 0 ) {
+          std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
+                    << host_parscan_check(i) << " != 0" << std::endl ;
+        }
+      }
+    }
+};
+
+template< class ExecSpace >
+struct TestTaskTeamValue {
+
+  enum { SPAN = 8 };
+
+  typedef long value_type ;
+  typedef Kokkos::TaskPolicy<ExecSpace>         policy_type ;
+  typedef Kokkos::Future<value_type,ExecSpace>  future_type ;
+  typedef Kokkos::View<long*,ExecSpace>         view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  result ;
+  const long nvalue ;
+
+  KOKKOS_INLINE_FUNCTION
+  TestTaskTeamValue( const policy_type & arg_policy
+                   , const view_type   & arg_result
+                   , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type const & member
+                 , value_type & final )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.is_null() ) {
+        if ( member.team_rank() == 0 ) {
+
+          future = policy.task_spawn
+            ( TestTaskTeamValue( policy , result , begin - 1 )
+            , Kokkos::TaskTeam );
+
+          assert( ! future.is_null() );
+
+          policy.respawn( this , future );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        final = result[nvalue] ;
+      }
+
+      Kokkos::memory_fence();
+    }
+
+  static void run( long n )
+    {
+      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
+      const unsigned memory_capacity = 100000 ;
+
+      policy_type root_policy( typename policy_type::memory_space()
+                             , memory_capacity );
+
+      view_type   root_result("result",n+1);
+
+      typename view_type::HostMirror
+        host_result = Kokkos::create_mirror_view( root_result );
+
+      future_type fv = root_policy.host_spawn
+        ( TestTaskTeamValue( root_policy, root_result, n ) , Kokkos::TaskTeam );
+
+      Kokkos::wait( root_policy );
+
+      Kokkos::deep_copy( host_result , root_result );
+
+      if ( fv.get() != n + 1 ) {
+        std::cerr << "TestTaskTeamValue ERROR future = "
+                  << fv.get() << " != " << n + 1 << std::endl ;
+      }
+      for ( long i = 0 ; i <= n ; ++i ) {
+        const long answer = i + 1 ;
+        if ( host_result(i) != answer ) {
+          std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
+                    << host_result(i) << " != " << answer << std::endl ;
+        }
+      }
+    }
+};
+} // namespace TestTaskPolicy
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace TestTaskPolicy {
+
 template< class ExecSpace >
 struct FibChild {
 
@@ -207,28 +686,8 @@ struct FibChild2 {
     }
 };
 
-namespace {
-
-long eval_fib( long n )
-{
-  if ( 2 <= n ) {
-    std::vector<long> fib(n+1);
-
-    fib[0] = 0 ;
-    fib[1] = 1 ;
-
-    for ( long i = 2 ; i <= n ; ++i ) { fib[i] = fib[i-2] + fib[i-1]; }
-
-    n = fib[n] ;
-  }
-
-  return n ;
-}
-
-}
-
 template< class ExecSpace >
-void test_fib( long n , const unsigned task_max_count = 1024 )
+void test_fib( long n , const unsigned task_max_count = 4096 )
 {
   const unsigned task_max_size   = 256 ;
   const unsigned task_dependence = 4 ;
@@ -654,9 +1113,15 @@ void test_latch( int n )
   typedef TaskLatchRun< ExecSpace >        task_type ;
   typedef typename task_type::policy_type  policy_type ;
 
-  // Primary + latch + n*LatchAdd
-  const unsigned task_max_count  = n + 2 ;
-  const unsigned task_max_size   = sizeof(task_type);
+  // Primary + latch + n * LatchAdd
+  //
+  // This test uses two different block sizes for allocation from the
+  // memory pool, so the memory size requested must be big enough to cause two
+  // or more superblocks to be used.  Currently, the superblock size in the
+  // task policy is 2^16, so make the minimum requested memory size greater
+  // than this.
+  const unsigned task_max_count  = n + 2 < 256 ? 256 : n + 2;
+  const unsigned task_max_size   = 256;
   const unsigned task_dependence = 4 ;
 
   policy_type
@@ -664,16 +1129,17 @@ void test_latch( int n )
           , task_max_size
           , task_dependence );
 
-
   policy.spawn( policy.proc_create( TaskLatchRun<ExecSpace>(policy,n) ) );
 
   wait( policy );
 }
 
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 } // namespace TestTaskPolicy
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP */
 
 
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index 810e74abdc..db6b0cff7e 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -89,6 +89,34 @@ struct TestTeamPolicy {
       }
     }
 
+  // included for test_small_league_size
+  TestTeamPolicy()
+    : m_flags()
+  {}
+
+  // included for test_small_league_size
+  struct NoOpTag {} ;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const NoOpTag & , const team_member & member ) const
+    {}
+
+
+  static void test_small_league_size() {
+
+    int bs = 8; // batch size (number of elements per batch)
+    int ns = 16; // total number of "problems" to process
+
+    // calculate total scratch memory space size
+    const int level = 0;
+    int mem_size = 960;
+    const int num_teams = ns/bs;
+    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO());
+
+    Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0))
+                         , TestTeamPolicy()
+                         );
+  }
+
   static void test_for( const size_t league_size )
     {
       TestTeamPolicy functor( league_size );
@@ -97,6 +125,8 @@ struct TestTeamPolicy {
 
       Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor );
       Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
+
+      test_small_league_size();
     }
 
   struct ReduceTag {};
@@ -617,7 +647,7 @@ struct TestScratchTeam {
     int team_scratch_size   = Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
                               Functor::shared_int_array_type::shmem_size(2*team_size);
     int thread_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_THREAD_COUNT);
-    Kokkos::parallel_reduce( team_exec.set_scratch_size(1,Kokkos::PerTeam(team_scratch_size),
+    Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size),
                                                           Kokkos::PerThread(thread_scratch_size)) ,
                              Functor() , result_type( & error_count ) );
 
@@ -626,4 +656,255 @@ struct TestScratchTeam {
 };
 }
 
+namespace Test {
+template< class ExecSpace>
+KOKKOS_INLINE_FUNCTION
+int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16);
+
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000);
+
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000);
+      Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000);
+
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) {
+        a_team1(i) = 1000000 + i;
+        a_team2(i) = 2000000 + i;
+        a_team3(i) = 3000000 + i;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){
+        a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
+        a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
+        a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) {
+        b_team1(i) = 1000000 + i;
+        b_team2(i) = 2000000 + i;
+        b_team3(i) = 3000000 + i;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){
+        b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
+        b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
+        b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+      });
+
+      team.team_barrier();
+      int error = 0;
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i) {
+        if(a_team1(i) != 1000000 + i) error++;
+        if(a_team2(i) != 2000000 + i) error++;
+        if(a_team3(i) != 3000000 + i) error++;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i){
+        if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
+        if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
+        if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i) {
+        if(b_team1(i) != 1000000 + i) error++;
+        if(b_team2(i) != 2000000 + i) error++;
+        if(b_team3(i) != 3000000 + i) error++;
+      });
+      team.team_barrier();
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i){
+        if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
+        if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
+        if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+      });
+
+  return error;
+}
+
+
+struct TagReduce {};
+struct TagFor {};
+
+template< class ExecSpace, class ScheduleType >
+struct ClassNoShmemSizeFunction {
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
+    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+    errors() += error;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
+    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  }
+
+  void run() {
+    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    errors = d_errors;
+
+    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
+    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+
+    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
+    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+    {
+    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
+    Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+      *this);
+    Kokkos::fence();
+    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
+    Kokkos::deep_copy(h_errors,d_errors);
+    ASSERT_EQ(h_errors(),0);
+    }
+
+    {
+    int error = 0;
+    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
+    Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+      *this,error);
+    Kokkos::fence();
+    ASSERT_EQ(error,0);
+    }
+  };
+};
+
+template< class ExecSpace, class ScheduleType >
+struct ClassWithShmemSizeFunction {
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
+    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+    errors() += error;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
+    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  }
+
+  void run() {
+    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    errors = d_errors;
+
+    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
+    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+    {
+    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
+    Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+      *this);
+    Kokkos::fence();
+    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors);
+    Kokkos::deep_copy(h_errors,d_errors);
+    ASSERT_EQ(h_errors(),0);
+    }
+
+    {
+    int error = 0;
+    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
+    Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+      *this,error);
+    Kokkos::fence();
+    ASSERT_EQ(error,0);
+    }
+  };
+
+  unsigned team_shmem_size(int team_size) const {
+    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
+    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+    return per_team0 + team_size * per_thread0;
+  }
+};
+
+template< class ExecSpace, class ScheduleType >
+void test_team_mulit_level_scratch_test_lambda() {
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View<int,ExecSpace> d_errors("Errors");
+  errors = d_errors;
+
+  const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
+  const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+
+  const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
+  const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+
+  Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16);
+  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
+    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+    errors() += error;
+  });
+  Kokkos::fence();
+  typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(errors);
+  Kokkos::deep_copy(h_errors,d_errors);
+  ASSERT_EQ(h_errors(),0);
+
+  int error = 0;
+  Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
+    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) {
+      count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  },error);
+  ASSERT_EQ(error,0);
+  Kokkos::fence();
+#endif
+}
+
+
+}
+
+namespace {
+template< class ExecSpace, class ScheduleType >
+struct TestMultiLevelScratchTeam {
+
+  TestMultiLevelScratchTeam()
+  { run(); }
+
+  void run()
+  {
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+    Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
+#endif
+    Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
+    c1.run();
+
+    Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
+    c2.run();
+
+  }
+};
+}
+
+namespace Test {
+
+template< class ExecSpace >
+struct TestShmemSize {
+
+  TestShmemSize() { run(); }
+
+  void run()
+  {
+    typedef Kokkos::View< long***, ExecSpace > view_type;
+
+    size_t d1 = 5;
+    size_t d2 = 6;
+    size_t d3 = 7;
+
+    size_t size = view_type::shmem_size( d1, d2, d3 );
+
+    ASSERT_EQ( size, d1 * d2 * d3 * sizeof(long) );
+  }
+};
+}
+
 /*--------------------------------------------------------------------------*/
diff --git a/lib/kokkos/core/unit_test/TestThreads.cpp b/lib/kokkos/core/unit_test/TestThreads.cpp
index 03c7c44958..93049b95dd 100644
--- a/lib/kokkos/core/unit_test/TestThreads.cpp
+++ b/lib/kokkos/core/unit_test/TestThreads.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -66,6 +66,7 @@
 #include <TestViewSubview.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
 
 #include <TestReduce.hpp>
 #include <TestScan.hpp>
@@ -87,6 +88,8 @@
 
 #include <TestPolicyConstruction.hpp>
 
+#include <TestMDRange.hpp>
+
 namespace Test {
 
 class threads : public ::testing::Test {
@@ -112,7 +115,6 @@ protected:
     Kokkos::Threads::initialize( threads_count );
     Kokkos::Threads::finalize();
 
-    
     threads_count = std::max( 1u , numa_count * 2 )
                   * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
 
@@ -143,6 +145,12 @@ TEST_F( threads , init ) {
   ;
 }
 
+TEST_F( threads , md_range ) {
+  TestMDRange_2D< Kokkos::Threads >::test_for2(100,100);
+
+  TestMDRange_3D< Kokkos::Threads >::test_for3(100,100,100);
+}
+
 TEST_F( threads , dispatch )
 {
   const int repeat = 100 ;
@@ -235,6 +243,13 @@ TEST_F( threads, view_aggregate ) {
 
 TEST_F( threads , range_tag )
 {
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
+  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(2);
   TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
   TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
   TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
@@ -246,6 +261,10 @@ TEST_F( threads , range_tag )
 
 TEST_F( threads , team_tag )
 {
+  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
+  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
+  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
   TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
   TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
   TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
@@ -260,6 +279,14 @@ TEST_F( threads, double_reduce) {
   TestReduce< double ,   Kokkos::Threads >( 1000000 );
 }
 
+TEST_F( threads , reducers )
+{
+  TestReducers<int, Kokkos::Threads>::execute_integer();
+  TestReducers<size_t, Kokkos::Threads>::execute_integer();
+  TestReducers<double, Kokkos::Threads>::execute_float();
+  TestReducers<Kokkos::complex<double>, Kokkos::Threads>::execute_basic();
+}
+
 TEST_F( threads, team_long_reduce) {
   TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
   TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
@@ -291,13 +318,17 @@ TEST_F( threads, team_shared_request) {
   TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA) 
+#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
 TEST_F( threads, team_lambda_shared_request) {
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
+TEST_F( threads, shmem_size) {
+  TestShmemSize< Kokkos::Threads >();
+}
+
 TEST_F( threads , view_remap )
 {
   enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
@@ -382,6 +413,75 @@ TEST_F( threads , atomics )
   ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,3) ) );
 }
 
+TEST_F( threads , atomic_operations )
+{
+  const int start = 1; //Avoid zero for division
+  const int end = 11;
+  for (int i = start; i < end; ++i)
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 9 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 4 ) ) );
+  }
+
+}
+
 //----------------------------------------------------------------------------
 
 #if 0
@@ -434,7 +534,9 @@ TEST_F( threads , memory_pool )
   bool val = TestMemoryPool::test_mempool< Kokkos::Threads >( 128, 128000000 );
   ASSERT_TRUE( val );
 
-  TestMemoryPool::test_mempool2< Kokkos::Threads >( 128, 128000000 );
+  TestMemoryPool::test_mempool2< Kokkos::Threads >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Threads >();
 }
 
 //----------------------------------------------------------------------------
@@ -478,6 +580,8 @@ TEST_F( threads , team_vector )
   ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) );
 }
 
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
 TEST_F( threads , task_policy )
 {
   TestTaskPolicy::test_task_dep< Kokkos::Threads >( 10 );
@@ -503,6 +607,8 @@ TEST_F( threads , task_latch )
   TestTaskPolicy::test_latch< Kokkos::Threads >(1000);
 }
 
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
 } // namespace Test
 
 #endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
index 60c87472df..ae4c6d2185 100644
--- a/lib/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -63,7 +63,9 @@ size_t allocation_count( const Kokkos::View<T,P...> & view )
   const size_t card  = view.size();
   const size_t alloc = view.span();
 
-  return card <= alloc ? alloc : 0 ;
+  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+
+  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
 }
 
 #else
diff --git a/lib/kokkos/example/fenl/CGSolve.hpp b/lib/kokkos/example/fenl/CGSolve.hpp
index 370dee15ac..06a0030e09 100644
--- a/lib/kokkos/example/fenl/CGSolve.hpp
+++ b/lib/kokkos/example/fenl/CGSolve.hpp
@@ -245,8 +245,8 @@ void cgsolve( const ImportType & import
   norm_res  = sqrt( old_rdot );
   iteration = 0 ;
 
-  Kokkos::Impl::Timer wall_clock ;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer wall_clock ;
+  Kokkos::Timer timer;
 
   while ( tolerance < norm_res && iteration < maximum_iteration ) {
 
diff --git a/lib/kokkos/example/fenl/fenl_functors.hpp b/lib/kokkos/example/fenl/fenl_functors.hpp
index 30f5274a51..3020c99a2f 100644
--- a/lib/kokkos/example/fenl/fenl_functors.hpp
+++ b/lib/kokkos/example/fenl/fenl_functors.hpp
@@ -138,7 +138,7 @@ public:
       //--------------------------------
       // Guess at capacity required for the map:
 
-      Kokkos::Impl::Timer wall_clock ;
+      Kokkos::Timer wall_clock ;
 
       wall_clock.reset();
       phase = FILL_NODE_SET ;
diff --git a/lib/kokkos/example/fenl/fenl_impl.hpp b/lib/kokkos/example/fenl/fenl_impl.hpp
index 9c57da2989..64070ce55f 100644
--- a/lib/kokkos/example/fenl/fenl_impl.hpp
+++ b/lib/kokkos/example/fenl/fenl_impl.hpp
@@ -312,7 +312,7 @@ Perf fenl(
 
   //------------------------------------
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
 
   Perf perf_stats = Perf() ;
 
diff --git a/lib/kokkos/example/global_2_local_ids/G2L.hpp b/lib/kokkos/example/global_2_local_ids/G2L.hpp
index d4198c61ac..9023ae0426 100644
--- a/lib/kokkos/example/global_2_local_ids/G2L.hpp
+++ b/lib/kokkos/example/global_2_local_ids/G2L.hpp
@@ -186,7 +186,7 @@ size_t test_global_to_local_ids(unsigned num_ids, unsigned capacity, unsigned nu
   typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
 
   double elasped_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   local_id_view local_2_global("local_ids", num_ids);
   global_id_view global_2_local(capacity);
diff --git a/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp
index 837c74038c..ca819e4f97 100644
--- a/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp
+++ b/lib/kokkos/example/ichol/example/example_chol_performance_device.hpp
@@ -76,7 +76,7 @@ namespace Tacho {
 
     int r_val = 0;
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
     double
       t_import = 0.0,
       t_reorder = 0.0,
diff --git a/lib/kokkos/example/md_skeleton/main.cpp b/lib/kokkos/example/md_skeleton/main.cpp
index 06287bc609..58cf76cab0 100644
--- a/lib/kokkos/example/md_skeleton/main.cpp
+++ b/lib/kokkos/example/md_skeleton/main.cpp
@@ -76,7 +76,7 @@ int main(int argc, char** argv) {
   int iter = 100;
 
   /* Default value for system size (4*nx*ny*nz atoms)
-   * nx, ny and nz are set to system_size if not specififed on commandline */
+   * nx, ny and nz are set to system_size if not specified on commandline */
 
   int system_size = 20;
   int nx = -1;
@@ -191,7 +191,7 @@ int main(int argc, char** argv) {
 
   printf("-> Running %i force calculations\n",iter);
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   for(int i=0;i<iter;i++) {
     force(system,0);
diff --git a/lib/kokkos/example/multi_fem/Explicit.hpp b/lib/kokkos/example/multi_fem/Explicit.hpp
index ddeb53ae61..cef1a37a1a 100644
--- a/lib/kokkos/example/multi_fem/Explicit.hpp
+++ b/lib/kokkos/example/multi_fem/Explicit.hpp
@@ -127,7 +127,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
 
   PerformanceData perf_data ;
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
 
   //------------------------------------
   // Generate fields
diff --git a/lib/kokkos/example/multi_fem/Implicit.hpp b/lib/kokkos/example/multi_fem/Implicit.hpp
index 0017cb8e88..53f602f11a 100644
--- a/lib/kokkos/example/multi_fem/Implicit.hpp
+++ b/lib/kokkos/example/multi_fem/Implicit.hpp
@@ -154,7 +154,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
 
   typename graph_factory::element_map_type element_map ;
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
 
   //------------------------------------
   // Generate sparse matrix graph and element->graph map.
diff --git a/lib/kokkos/example/multi_fem/Nonlinear.hpp b/lib/kokkos/example/multi_fem/Nonlinear.hpp
index 96a05b97a9..1d243395c2 100644
--- a/lib/kokkos/example/multi_fem/Nonlinear.hpp
+++ b/lib/kokkos/example/multi_fem/Nonlinear.hpp
@@ -243,7 +243,7 @@ PerformanceData run( const typename FixtureType::FEMeshType & mesh ,
   //------------------------------------
   // Generate mesh and corresponding sparse matrix graph
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
 
   //------------------------------------
   // Generate sparse matrix graph and element->graph map.
diff --git a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
index 6ab42da50c..8d140b6d25 100644
--- a/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
+++ b/lib/kokkos/example/multi_fem/SparseLinearSystem.hpp
@@ -243,7 +243,7 @@ void cgsolve(
   normr     = sqrt( old_rdot );
   iteration = 0 ;
 
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
 
   while ( tolerance < normr && iteration < maximum_iteration ) {
 
diff --git a/lib/kokkos/example/sort_array/CMakeLists.txt b/lib/kokkos/example/sort_array/CMakeLists.txt
index 3e58198d7b..0c7da74f4a 100644
--- a/lib/kokkos/example/sort_array/CMakeLists.txt
+++ b/lib/kokkos/example/sort_array/CMakeLists.txt
@@ -1,4 +1,3 @@
-INCLUDE(TribitsAddExecutableAndTest)   
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/lib/kokkos/example/sort_array/sort_array.hpp b/lib/kokkos/example/sort_array/sort_array.hpp
index 018b1ee8e8..d21f998958 100644
--- a/lib/kokkos/example/sort_array/sort_array.hpp
+++ b/lib/kokkos/example/sort_array/sort_array.hpp
@@ -116,7 +116,7 @@ void sort_array( const size_t array_length /* length of spans of array to sort *
 
 #endif
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
 
   const device_array_type  work_array("work_array" , array_length );
   const host_array_type    host_array("host_array" , total_length );
diff --git a/lib/kokkos/example/tutorial/01_hello_world/Makefile b/lib/kokkos/example/tutorial/01_hello_world/Makefile
index 38fb1b8f86..78a9fed0cc 100644
--- a/lib/kokkos/example/tutorial/01_hello_world/Makefile
+++ b/lib/kokkos/example/tutorial/01_hello_world/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
index bd2371382a..95ee2c47fe 100644
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
index 38fb1b8f86..78a9fed0cc 100644
--- a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
+++ b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
index bd2371382a..95ee2c47fe 100644
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile
index 38fb1b8f86..78a9fed0cc 100644
--- a/lib/kokkos/example/tutorial/03_simple_view/Makefile
+++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
index bd2371382a..95ee2c47fe 100644
--- a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
index 38fb1b8f86..78a9fed0cc 100644
--- a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
+++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile
index 38fb1b8f86..78a9fed0cc 100644
--- a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile
+++ b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
index e61e8af59b..8406c504c9 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/data_layouts.cpp
@@ -142,7 +142,7 @@ int main (int narg, char* arg[]) {
   // Measure time to execute the contraction kernel when giving it a
   // LayoutLeft view for v1 and a LayoutRight view for v2. This should be
   // fast on GPUs and slow on CPUs
-  Kokkos::Impl::Timer time1;
+  Kokkos::Timer time1;
   Kokkos::parallel_for(size,contraction<left_type,right_type>(a,l,r));
   Kokkos::fence();
   double sec1 = time1.seconds();
@@ -154,7 +154,7 @@ int main (int narg, char* arg[]) {
   // Measure time to execute the contraction kernel when giving it a
   // LayoutRight view for v1 and a LayoutLeft view for v2. This should be
   // fast on CPUs and slow on GPUs
-  Kokkos::Impl::Timer time2;
+  Kokkos::Timer time2;
   Kokkos::parallel_for(size,contraction<right_type,left_type>(a,r,l));
   Kokkos::fence();
   double sec2 = time2.seconds();
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
index 8317c78bc9..ddd28a97c3 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/memory_traits.cpp
@@ -124,12 +124,12 @@ int main(int narg, char* arg[]) {
   // Run the localsum functor using the RandomAccess trait. On CPUs there should
   // not be any different in performance to not using the RandomAccess trait.
   // On GPUs where can be a dramatic difference
-  Kokkos::Impl::Timer time1;
+  Kokkos::Timer time1;
   Kokkos::parallel_for(size,localsum<view_type,view_type_rnd>(idx,dest,src));
   Kokkos::fence();
   double sec1 = time1.seconds();
 
-  Kokkos::Impl::Timer time2;
+  Kokkos::Timer time2;
   Kokkos::parallel_for(size,localsum<view_type,view_type>(idx,dest,src));
   Kokkos::fence();
   double sec2 = time2.seconds();
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
index 62ddb9c18a..4905e4bf88 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/dual_view.cpp
@@ -87,9 +87,9 @@ struct localsum {
   // For example, the const_data_type version of double** is const
   // double**.
   Kokkos::View<idx_type::const_data_type, idx_type::array_layout, memory_space> idx;
-  // "array_intrinsic_type" is a typedef in ViewTraits (and DualView) which is the
+  // "scalar_array_type" is a typedef in ViewTraits (and DualView) which is the
   // array version of the value(s) stored in the View.
-  Kokkos::View<view_type::array_intrinsic_type, view_type::array_layout, memory_space> dest;
+  Kokkos::View<view_type::scalar_array_type, view_type::array_layout, memory_space> dest;
   Kokkos::View<view_type::const_data_type, view_type::array_layout,
                memory_space, Kokkos::MemoryRandomAccess> src;
 
@@ -150,6 +150,9 @@ protected:
 int main (int narg, char* arg[]) {
   Kokkos::initialize (narg, arg);
 
+// If the View element type is not trivially constructible, add braces so the Views
+// go out of scope before the Kokkos::finalize() call
+{
   ParticleTypes test("Test");
   Kokkos::fence();
   test.h_view(0) = ParticleType(-1e4,1);
@@ -182,7 +185,7 @@ int main (int narg, char* arg[]) {
 
   // Run on the device.  This will cause a sync of idx to the device,
   // since it was marked as modified on the host.
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
   Kokkos::fence();
   double sec1_dev = timer.seconds();
@@ -208,6 +211,7 @@ int main (int narg, char* arg[]) {
 
   printf("Device Time with Sync: %f without Sync: %f \n",sec1_dev,sec2_dev);
   printf("Host   Time with Sync: %f without Sync: %f \n",sec1_host,sec2_host);
+}
 
   Kokkos::finalize();
 }
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
index a7d460a1cc..cf5326b687 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp
@@ -97,7 +97,7 @@ int main(int narg, char* arg[]) {
   Kokkos::fence();
   // Run on the device
   // This will cause a sync of idx to the device since it was modified on the host
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   Kokkos::parallel_for(size,localsum<view_type::execution_space>(idx,dest,src));
   Kokkos::fence();
   double sec1_dev = timer.seconds();
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
index 06955b3641..60a514f4d5 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
+++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3 --default-stream per-thread 
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
index 8c7e26c850..5da3bf76c9 100644
--- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
+++ b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/overlapping_deepcopy.cpp
@@ -116,7 +116,7 @@ int main(int argc, char * argv[]) {
   Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(0.0,d_a));
   Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),FillDevice(1.3513,d_b));
   Kokkos::fence();
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0,size),ComputeADevice(20,d_a,d_b));
 
   if(synch==1)
diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
+++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
index 52816333c2..3e6175a756 100644
--- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
+++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/random_numbers.cpp
@@ -122,7 +122,7 @@ int main(int argc, char* args[]) {
   Kokkos::DualView<uint64_t*> vals("Vals",size*samples);
 
   // Run some performance comparisons
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   Kokkos::parallel_for(size,generate_random<Kokkos::Random_XorShift64_Pool<> >(vals.d_view,rand_pool64,samples));
   Kokkos::fence();
 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
index 3d056537c3..965b72b4e9 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile
@@ -5,13 +5,14 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
 EXE = $(SRC:.cpp=.cuda)
 KOKKOS_DEVICES = "Cuda,OpenMP"
 KOKKOS_ARCH = "SNB,Kepler35"
+KOKKOS_CUDA_OPTIONS = "enable_lambda"
 else
 CXX = g++
 CXXFLAGS = -O3
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
index 5d081bf62b..565dd22e82 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -62,7 +62,8 @@ int main (int narg, char* args[]) {
 
   // Set up a policy that launches 12 teams, with the maximum number
   // of threads per team.
-  const team_policy policy (12, team_policy::team_size_max ( [=]{} ));
+
+  const team_policy policy (12, Kokkos::AUTO);
 
   // This is a reduction with a team policy.  The team policy changes
   // the first argument of the lambda.  Rather than an integer index
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
index 0eac4309a9..99d5958edf 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/vectorization.cpp
@@ -141,11 +141,11 @@ int main(int narg, char* args[]) {
 
   // Each team handles a slice of the data
   // Set up TeamPolicy with 512 teams with maximum number of threads per team and 16 vector lanes.
-  // The team_size_max function will determine the maximum number of threads taking into account
-  // shared memory requirements of the Functor.
+  // Kokkos::AUTO will determine the number of threads
   // The maximum vector length is hardware dependent but can always be smaller than the hardware allows.
   // The vector length must be a power of 2.
-  const Kokkos::TeamPolicy<> policy( 512 , Kokkos::TeamPolicy<>::team_size_max(SomeCorrelation(data,gsum)) , 16);
+
+  const Kokkos::TeamPolicy<> policy( 512 , Kokkos::AUTO , 16);
 
   Kokkos::parallel_for( policy , SomeCorrelation(data,gsum) );
 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
index 3d056537c3..12ad36b31e 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile
@@ -5,7 +5,7 @@ default: build
 	echo "Start Build"
 
 ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
-CXX = nvcc_wrapper
+CXX = ../../../../config/nvcc_wrapper
 CXXFLAGS = -O3
 LINK = ${CXX}
 LINKFLAGS = 
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
index a9b20da1ae..c12b11d04d 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/team_scan.cpp
@@ -117,7 +117,7 @@ int main(int narg, char* args[]) {
   Kokkos::DualView<int**> histogram("histogram",TEAM_SIZE,TEAM_SIZE);
 
 
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;
   // threads/team is automatically limited to maximum supported by the device.
   Kokkos::parallel_for( team_policy( nchunks , TEAM_SIZE )
                       , find_2_tuples(chunk_size,data,histogram) );
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
index 016e5dfe35..86f136da96 100755
--- a/lib/kokkos/generate_makefile.bash
+++ b/lib/kokkos/generate_makefile.bash
@@ -107,7 +107,7 @@ case $key in
     exit 0
     ;;
     *)
-            # unknown option
+    echo "warning: ignoring unknown option $key"
     ;;
 esac
 shift
diff --git a/src/MANYBODY/pair_vashishta.cpp b/src/MANYBODY/pair_vashishta.cpp
index aa030540f4..19f6017907 100755
--- a/src/MANYBODY/pair_vashishta.cpp
+++ b/src/MANYBODY/pair_vashishta.cpp
@@ -541,7 +541,7 @@ void PairVashishta::setup_params()
 /* ---------------------------------------------------------------------- */
 
 void PairVashishta::twobody(Param *param, double rsq, double &fforce,
-                     int eflag, double &eng)
+                            int eflag, double &eng)
 {
   double r,rinvsq,r4inv,r6inv,reta,lam1r,lam4r,vc2,vc3;
 
diff --git a/src/MANYBODY/pair_vashishta.h b/src/MANYBODY/pair_vashishta.h
index cdd2da3471..87077011e6 100755
--- a/src/MANYBODY/pair_vashishta.h
+++ b/src/MANYBODY/pair_vashishta.h
@@ -17,8 +17,8 @@ PairStyle(vashishta,PairVashishta)
 
 #else
 
-#ifndef LMP_PAIR_Vashishta_H
-#define LMP_PAIR_Vashishta_H
+#ifndef LMP_PAIR_VASHISHITA_H
+#define LMP_PAIR_VASHISHITA_H
 
 #include "pair.h"
 
@@ -29,10 +29,10 @@ class PairVashishta : public Pair {
   PairVashishta(class LAMMPS *);
   virtual ~PairVashishta();
   virtual void compute(int, int);
-  void settings(int, char **);
+  virtual void settings(int, char **);
   void coeff(int, char **);
-  virtual double init_one(int, int);
-  virtual void init_style();
+  double init_one(int, int);
+  void init_style();
 
  protected:
   struct Param {
@@ -55,9 +55,9 @@ class PairVashishta : public Pair {
   int maxparam;                 // max # of parameter sets
   Param *params;                // parameter set for an I-J-K interaction
 
-  virtual void allocate();
+  void allocate();
   void read_file(char *);
-  void setup_params();
+  virtual void setup_params();
   void twobody(Param *, double, double &, int, double &);
   void threebody(Param *, Param *, Param *, double, double, double *, double *,
                  double *, double *, int, double &);
diff --git a/src/fix_nve_sphere.cpp b/src/fix_nve_sphere.cpp
index 9f7b4a9eaa..42cca31f6b 100644
--- a/src/fix_nve_sphere.cpp
+++ b/src/fix_nve_sphere.cpp
@@ -133,12 +133,13 @@ void FixNVESphere::initial_integrate(int vflag)
   
   // update mu for dipoles
   
-
   if (extra == DIPOLE) {
     double **mu = atom->mu;
     if (dlm == NODLM) {
+
       // d_mu/dt = omega cross mu
       // renormalize mu to dipole length
+
       for (int i = 0; i < nlocal; i++)
         if (mask[i] & groupbit)
           if (mu[i][3] > 0.0) {
@@ -152,7 +153,9 @@ void FixNVESphere::initial_integrate(int vflag)
             mu[i][2] = g[2]*scale;
           }
     } else {
-      // Integrate orientation following Dullweber-Leimkuhler-Maclachlan scheme
+
+      // integrate orientation following Dullweber-Leimkuhler-Maclachlan scheme
+
       for (int i = 0; i < nlocal; i++) {
         if (mask[i] & groupbit && mu[i][3] > 0.0) {
           
@@ -160,8 +163,9 @@ void FixNVESphere::initial_integrate(int vflag)
           // Q is the rotation matrix from space frame to body frame
           // i.e. v_b = Q.v_s
           
-          // Define mu to lie along the z axis in the body frame
-          // We take the unit dipole to avoid getting a scaling matrix
+          // define mu to lie along the z axis in the body frame
+          // take the unit dipole to avoid getting a scaling matrix
+
           inv_len_mu = 1.0/mu[i][3];
           a[0] = mu[i][0]*inv_len_mu;
           a[1] = mu[i][1]*inv_len_mu;
@@ -180,9 +184,15 @@ void FixNVESphere::initial_integrate(int vflag)
           if (s2 != 0.0){ // i.e. the vectors are not parallel
             scale = (1.0 - a[2])/s2;
             
-            Q[0][0] = 1.0 - scale*a[0]*a[0]; Q[0][1] = -scale*a[0]*a[1];      Q[0][2] = -a[0];
-            Q[1][0] = -scale*a[0]*a[1];      Q[1][1] = 1.0 - scale*a[1]*a[1]; Q[1][2] = -a[1];
-            Q[2][0] = a[0];                  Q[2][1] = a[1];                  Q[2][2] = 1.0 - scale*(a[0]*a[0] + a[1]*a[1]);
+            Q[0][0] = 1.0 - scale*a[0]*a[0]; 
+            Q[0][1] = -scale*a[0]*a[1];
+            Q[0][2] = -a[0];
+            Q[1][0] = -scale*a[0]*a[1];
+            Q[1][1] = 1.0 - scale*a[1]*a[1];
+            Q[1][2] = -a[1];
+            Q[2][0] = a[0];
+            Q[2][1] = a[1];
+            Q[2][2] = 1.0 - scale*(a[0]*a[0] + a[1]*a[1]);
           } else { // if parallel then we just have I or -I
             Q[0][0] = 1.0/a[2];  Q[0][1] = 0.0;       Q[0][2] = 0.0;
             Q[1][0] = 0.0;       Q[1][1] = 1.0/a[2];  Q[1][2] = 0.0;
@@ -242,7 +252,9 @@ void FixNVESphere::initial_integrate(int vflag)
           
           // Transform w back into space frame w_temp = Q^T.w
           transpose_matvec(Q_temp,w,w_temp);
-          omega[i][0] = w_temp[0]; omega[i][1] = w_temp[1]; omega[i][2] = w_temp[2];
+          omega[i][0] = w_temp[0]; 
+          omega[i][1] = w_temp[1];
+          omega[i][2] = w_temp[2];
           
           // Set dipole according to updated Q: mu = Q^T.[0 0 1] * |mu|
           mu[i][0] = Q_temp[2][0] * mu[i][3];
@@ -289,7 +301,8 @@ void FixNVESphere::final_integrate()
       omega[i][0] += dtirotate * torque[i][0];
       omega[i][1] += dtirotate * torque[i][1];
       omega[i][2] += dtirotate * torque[i][2];
-      rke += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + omega[i][2]*omega[i][2])*radius[i]*radius[i]*rmass[i];
+      rke += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + 
+              omega[i][2]*omega[i][2])*radius[i]*radius[i]*rmass[i];
     }
   
 }
diff --git a/src/group.cpp b/src/group.cpp
index da0e94fc11..973fcbdcce 100644
--- a/src/group.cpp
+++ b/src/group.cpp
@@ -1665,46 +1665,93 @@ void Group::inertia(int igroup, double *cm, double itensor[3][3], int iregion)
 
 /* ----------------------------------------------------------------------
    compute angular velocity omega from L and I
-   really not a group/region operation, but L,I were computed for a group/region
-   diagonalize I instead of inverting it, to allow for a singular matrix
 ------------------------------------------------------------------------- */
 
 void Group::omega(double *angmom, double inertia[3][3], double *w)
 {
   double idiag[3],ex[3],ey[3],ez[3],cross[3];
-  double evectors[3][3];
-  
-  int ierror = MathExtra::jacobi(inertia,idiag,evectors);
-  if (ierror) error->all(FLERR,
-                         "Insufficient Jacobi rotations for group::omega");
-
-  ex[0] = evectors[0][0];
-  ex[1] = evectors[1][0];
-  ex[2] = evectors[2][0];
-  ey[0] = evectors[0][1];
-  ey[1] = evectors[1][1];
-  ey[2] = evectors[2][1];
-  ez[0] = evectors[0][2];
-  ez[1] = evectors[1][2];
-  ez[2] = evectors[2][2];
-  
-  // enforce 3 evectors as a right-handed coordinate system
-  // flip 3rd vector if needed
-  
-  MathExtra::cross3(ex,ey,cross);
-  if (MathExtra::dot3(cross,ez) < 0.0) MathExtra::negate3(ez);
-  
-  // if any principal moment < scaled EPSILON, set to 0.0
+  double evectors[3][3],inverse[3][3];
+
+  // determinant = triple product of rows of inertia matrix
+
+  double determinant = inertia[0][0]*inertia[1][1]*inertia[2][2] +
+    inertia[0][1]*inertia[1][2]*inertia[2][0] +
+    inertia[0][2]*inertia[1][0]*inertia[2][1] -
+    inertia[0][0]*inertia[1][2]*inertia[2][1] -
+    inertia[0][1]*inertia[1][0]*inertia[2][2] -
+    inertia[2][0]*inertia[1][1]*inertia[0][2];
+
+  // non-singular I matrix
+  // use L = Iw, inverting I to solve for w
+  // this should give exact zeroing of angular momentum by velocity command
+
+  if (determinant > EPSILON) {
+
+    inverse[0][0] = inertia[1][1]*inertia[2][2] - inertia[1][2]*inertia[2][1];
+    inverse[0][1] = -(inertia[0][1]*inertia[2][2] - 
+                      inertia[0][2]*inertia[2][1]);
+    inverse[0][2] = inertia[0][1]*inertia[1][2] - inertia[0][2]*inertia[1][1];
+
+    inverse[1][0] = -(inertia[1][0]*inertia[2][2] - 
+                      inertia[1][2]*inertia[2][0]);
+    inverse[1][1] = inertia[0][0]*inertia[2][2] - inertia[0][2]*inertia[2][0];
+    inverse[1][2] = -(inertia[0][0]*inertia[1][2] - 
+                      inertia[0][2]*inertia[1][0]);
+
+    inverse[2][0] = inertia[1][0]*inertia[2][1] - inertia[1][1]*inertia[2][0];
+    inverse[2][1] = -(inertia[0][0]*inertia[2][1] - 
+                      inertia[0][1]*inertia[2][0]);
+    inverse[2][2] = inertia[0][0]*inertia[1][1] - inertia[0][1]*inertia[1][0];
+
+    for (int i = 0; i < 3; i++)
+      for (int j = 0; j < 3; j++)
+        inverse[i][j] /= determinant;
+
+    w[0] = inverse[0][0]*angmom[0] + inverse[0][1]*angmom[1] +
+      inverse[0][2]*angmom[2];
+    w[1] = inverse[1][0]*angmom[0] + inverse[1][1]*angmom[1] +
+      inverse[1][2]*angmom[2];
+    w[2] = inverse[2][0]*angmom[0] + inverse[2][1]*angmom[1] +
+      inverse[2][2]*angmom[2];
+
+  // handle (nearly) singular I matrix
+  // typically due to 2-atom group or linear molecule
+  // use jacobi() and angmom_to_omega() to calculate valid omega
+  // less exact answer than matrix inversion, due to iterative Jacobi method
+
+  } else {
+    int ierror = MathExtra::jacobi(inertia,idiag,evectors);
+    if (ierror) error->all(FLERR,
+                           "Insufficient Jacobi rotations for group::omega");
+
+    ex[0] = evectors[0][0];
+    ex[1] = evectors[1][0];
+    ex[2] = evectors[2][0];
+    ey[0] = evectors[0][1];
+    ey[1] = evectors[1][1];
+    ey[2] = evectors[2][1];
+    ez[0] = evectors[0][2];
+    ez[1] = evectors[1][2];
+    ez[2] = evectors[2][2];
   
-  double max;
-  max = MAX(idiag[0],idiag[1]);
-  max = MAX(max,idiag[2]);
+    // enforce 3 evectors as a right-handed coordinate system
+    // flip 3rd vector if needed
   
-  if (idiag[0] < EPSILON*max) idiag[0] = 0.0;
-  if (idiag[1] < EPSILON*max) idiag[1] = 0.0;
-  if (idiag[2] < EPSILON*max) idiag[2] = 0.0;
+    MathExtra::cross3(ex,ey,cross);
+    if (MathExtra::dot3(cross,ez) < 0.0) MathExtra::negate3(ez);
   
-  // calculate omega using diagonalized inertia matrix
+    // if any principal moment < scaled EPSILON, set to 0.0
   
-  MathExtra::angmom_to_omega(angmom,ex,ey,ez,idiag,w);
+    double max;
+    max = MAX(idiag[0],idiag[1]);
+    max = MAX(max,idiag[2]);
+    
+    if (idiag[0] < EPSILON*max) idiag[0] = 0.0;
+    if (idiag[1] < EPSILON*max) idiag[1] = 0.0;
+    if (idiag[2] < EPSILON*max) idiag[2] = 0.0;
+    
+    // calculate omega using diagonalized inertia matrix
+    
+    MathExtra::angmom_to_omega(angmom,ex,ey,ez,idiag,w);
+  }
 }
-- 
GitLab